diff mbox

[next,1/5] i40e: Introduce devlink interface.

Message ID 1470329387-25138-1-git-send-email-sridhar.samudrala@intel.com
State Changes Requested
Headers show

Commit Message

Samudrala, Sridhar Aug. 4, 2016, 4:49 p.m. UTC
Add initial devlink support to set/get the mode of SRIOV switch.
This patch allows the mode to be set to either 'legacy' or 'switchdev', but
doesn't implement any functionality to create vf representors in switchdev
mode.

With smode support in iproute2 'devlink' utility, switch mode can be set
and get via following commands.

    # devlink dev smode pci/0000:05:00.0
    mode: legacy
    # devlink dev set pci/0000:05:00.0 smode switchdev
    # devlink dev smode pci/0000:05:00.0
    mode: switchdev

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 drivers/net/ethernet/intel/Kconfig          |  1 +
 drivers/net/ethernet/intel/i40e/i40e.h      |  3 ++
 drivers/net/ethernet/intel/i40e/i40e_main.c | 84 ++++++++++++++++++++++++++---
 3 files changed, 80 insertions(+), 8 deletions(-)

Comments

Nambiar, Amritha Aug. 10, 2016, 12:12 a.m. UTC | #1
On 8/4/2016 9:49 AM, Sridhar Samudrala wrote:
> Add initial devlink support to set/get the mode of SRIOV switch.
> This patch allows the mode to be set to either 'legacy' or 'switchdev', but
> doesn't implement any functionality to create vf representors in switchdev
> mode.
>
> With smode support in iproute2 'devlink' utility, switch mode can be set
> and get via following commands.
>
>      # devlink dev smode pci/0000:05:00.0
>      mode: legacy
>      # devlink dev set pci/0000:05:00.0 smode switchdev
>      # devlink dev smode pci/0000:05:00.0
>      mode: switchdev
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
   Acked-by: Amritha Nambiar <amritha.nambiar@intel.com>
> ---
>   drivers/net/ethernet/intel/Kconfig          |  1 +
>   drivers/net/ethernet/intel/i40e/i40e.h      |  3 ++
>   drivers/net/ethernet/intel/i40e/i40e_main.c | 84 ++++++++++++++++++++++++++---
>   3 files changed, 80 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
> index c0e1743..2ede229 100644
> --- a/drivers/net/ethernet/intel/Kconfig
> +++ b/drivers/net/ethernet/intel/Kconfig
> @@ -215,6 +215,7 @@ config I40E
>   	tristate "Intel(R) Ethernet Controller XL710 Family support"
>   	select PTP_1588_CLOCK
>   	depends on PCI
> +	depends on MAY_USE_DEVLINK
>   	---help---
>   	  This driver supports Intel(R) Ethernet Controller XL710 Family of
>   	  devices.  For more information on how to identify your adapter, go
> diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
> index 2a88291..724bd82 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e.h
> @@ -53,6 +53,8 @@
>   #include <linux/clocksource.h>
>   #include <linux/net_tstamp.h>
>   #include <linux/ptp_clock_kernel.h>
> +#include <net/devlink.h>
> +
>   #include "i40e_type.h"
>   #include "i40e_prototype.h"
>   #ifdef I40E_FCOE
> @@ -446,6 +448,7 @@ struct i40e_pf {
>   	u32 ioremap_len;
>   	u32 fd_inv;
>   	u16 phy_led_val;
> +	enum devlink_eswitch_mode eswitch_mode;
>   };
>   
>   enum i40e_filter_state {
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 339d99b..bb44f09 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -27,6 +27,7 @@
>   #include <linux/etherdevice.h>
>   #include <linux/of_net.h>
>   #include <linux/pci.h>
> +#include <net/devlink.h>
>   
>   /* Local includes */
>   #include "i40e.h"
> @@ -10671,6 +10672,58 @@ static void i40e_get_platform_mac_addr(struct pci_dev *pdev, struct i40e_pf *pf)
>   }
>   
>   /**
> + * i40e_devlink_eswitch_mode_get
> + *
> + * @devlink: pointer to devlink struct
> + * @mode: sr-iov switch mode pointer
> + *
> + * Returns the switch mode of the associated PF in the @mode pointer.
> + */
> +static int i40e_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
> +{
> +	struct i40e_pf *pf = devlink_priv(devlink);
> +
> +	*mode = pf->eswitch_mode;
> +
> +	return 0;
> +}
> +
> +/**
> + * i40e_devlink_eswitch_mode_set
> + *
> + * @devlink: pointer to devlink struct
> + * @mode: sr-iov switch mode
> + *
> + * Set the switch mode of the associated PF.
> + * Returns 0 on success and -EOPNOTSUPP on error.
> + */
> +static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
> +{
> +	struct i40e_pf *pf = devlink_priv(devlink);
> +	int err = 0;
> +
> +	if (mode == pf->eswitch_mode)
> +		goto done;
> +
> +	switch (mode) {
> +	case DEVLINK_ESWITCH_MODE_LEGACY:
> +	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
> +		pf->eswitch_mode = mode;
> +		break;
> +	default:
> +		err = -EOPNOTSUPP;
> +	}
> +
> +done:
> +	return err;
> +}
> +
> +static const struct devlink_ops i40e_devlink_ops = {
> +	.eswitch_mode_get = i40e_devlink_eswitch_mode_get,
> +	.eswitch_mode_set = i40e_devlink_eswitch_mode_set,
> +};
> +
> +/**
>    * i40e_probe - Device initialization routine
>    * @pdev: PCI device information struct
>    * @ent: entry in i40e_pci_tbl
> @@ -10687,6 +10740,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>   	struct i40e_pf *pf;
>   	struct i40e_hw *hw;
>   	static u16 pfs_found;
> +	struct devlink *devlink;
>   	u16 wol_nvm_bits;
>   	u16 link_status;
>   	int err;
> @@ -10721,16 +10775,21 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>   	pci_enable_pcie_error_reporting(pdev);
>   	pci_set_master(pdev);
>   
> +	devlink = devlink_alloc(&i40e_devlink_ops, sizeof(*pf));
> +	if (!devlink) {
> +		dev_err(&pdev->dev,
> +			"devlink_alloc failed\n");
> +		err = -ENOMEM;
> +		goto err_devlink_alloc;
> +	}
> +
>   	/* Now that we have a PCI connection, we need to do the
>   	 * low level device setup.  This is primarily setting up
>   	 * the Admin Queue structures and then querying for the
>   	 * device's current profile information.
>   	 */
> -	pf = kzalloc(sizeof(*pf), GFP_KERNEL);
> -	if (!pf) {
> -		err = -ENOMEM;
> -		goto err_pf_alloc;
> -	}
> +	pf = devlink_priv(devlink);
> +
>   	pf->next_vsi = 0;
>   	pf->pdev = pdev;
>   	set_bit(__I40E_DOWN, &pf->state);
> @@ -11047,6 +11106,11 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>   		}
>   	}
>   
> +	pf->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
> +	err = devlink_register(devlink, &pdev->dev);
> +	if (err)
> +		goto err_devlink_register;
> +
>   #ifdef CONFIG_PCI_IOV
>   	/* prep for VF support */
>   	if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) &&
> @@ -11188,6 +11252,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>   	return 0;
>   
>   	/* Unwind what we've done if something failed in the setup */
> +err_devlink_register:
> +	i40e_stop_misc_vector(pf);
>   err_vsis:
>   	set_bit(__I40E_DOWN, &pf->state);
>   	i40e_clear_interrupt_scheme(pf);
> @@ -11205,8 +11271,8 @@ err_adminq_setup:
>   err_pf_reset:
>   	iounmap(hw->hw_addr);
>   err_ioremap:
> -	kfree(pf);
> -err_pf_alloc:
> +	devlink_free(devlink);
> +err_devlink_alloc:
>   	pci_disable_pcie_error_reporting(pdev);
>   	pci_release_selected_regions(pdev,
>   				     pci_select_bars(pdev, IORESOURCE_MEM));
> @@ -11229,9 +11295,11 @@ static void i40e_remove(struct pci_dev *pdev)
>   {
>   	struct i40e_pf *pf = pci_get_drvdata(pdev);
>   	struct i40e_hw *hw = &pf->hw;
> +	struct devlink *devlink = priv_to_devlink(pf);
>   	i40e_status ret_code;
>   	int i;
>   
> +	devlink_unregister(devlink);
>   	i40e_dbg_pf_exit(pf);
>   
>   	i40e_ptp_stop(pf);
> @@ -11319,7 +11387,7 @@ static void i40e_remove(struct pci_dev *pdev)
>   	kfree(pf->vsi);
>   
>   	iounmap(hw->hw_addr);
> -	kfree(pf);
> +	devlink_free(devlink);
>   	pci_release_selected_regions(pdev,
>   				     pci_select_bars(pdev, IORESOURCE_MEM));
>
Alexander Duyck Aug. 10, 2016, 4:01 p.m. UTC | #2
On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> Add initial devlink support to set/get the mode of SRIOV switch.
> This patch allows the mode to be set to either 'legacy' or 'switchdev', but
> doesn't implement any functionality to create vf representors in switchdev
> mode.
>
> With smode support in iproute2 'devlink' utility, switch mode can be set
> and get via following commands.
>
>     # devlink dev smode pci/0000:05:00.0
>     mode: legacy
>     # devlink dev set pci/0000:05:00.0 smode switchdev
>     # devlink dev smode pci/0000:05:00.0
>     mode: switchdev
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>

I really don't see much value in this patch.  If you are going to
support SwitchDev then just do it.  Otherwise you are adding extra
overhead for maintaining two different modes.

I would recommend putting this series out to netdev as an RFC.
Submitting it to intel-wired-lan is kind of pointless as the audience
it to small to get any valuable review.

- Alex
John Fastabend Aug. 10, 2016, 4:50 p.m. UTC | #3
On 16-08-10 09:01 AM, Alexander Duyck wrote:
> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
> <sridhar.samudrala@intel.com> wrote:
>> Add initial devlink support to set/get the mode of SRIOV switch.
>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but
>> doesn't implement any functionality to create vf representors in switchdev
>> mode.
>>
>> With smode support in iproute2 'devlink' utility, switch mode can be set
>> and get via following commands.
>>
>>     # devlink dev smode pci/0000:05:00.0
>>     mode: legacy
>>     # devlink dev set pci/0000:05:00.0 smode switchdev
>>     # devlink dev smode pci/0000:05:00.0
>>     mode: switchdev
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> 
> I really don't see much value in this patch.  If you are going to
> support SwitchDev then just do it.  Otherwise you are adding extra
> overhead for maintaining two different modes.
> 
> I would recommend putting this series out to netdev as an RFC.
> Submitting it to intel-wired-lan is kind of pointless as the audience
> it to small to get any valuable review.
> 
> - Alex

I argued at length about this already. Jiri and company wanted this flag
to push device in and out of this mode. Here we are just following the
already upstreamed and debated decision.

This is less about switchdev and more about generating VF netdevs to
use with ip tools and friends.

Another option would be to just always enable VF netdevs and have no
legacy mode at all. I think that would be fine it just depends on if
you think having extra netdevs around will confuse the stack at all.
It might create a few corner cases but one reasonable thing to do
would be to just fix those cases as they appear.

Thanks,
John
Alexander Duyck Aug. 10, 2016, 6:18 p.m. UTC | #4
On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend
<john.fastabend@gmail.com> wrote:
> On 16-08-10 09:01 AM, Alexander Duyck wrote:
>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
>> <sridhar.samudrala@intel.com> wrote:
>>> Add initial devlink support to set/get the mode of SRIOV switch.
>>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but
>>> doesn't implement any functionality to create vf representors in switchdev
>>> mode.
>>>
>>> With smode support in iproute2 'devlink' utility, switch mode can be set
>>> and get via following commands.
>>>
>>>     # devlink dev smode pci/0000:05:00.0
>>>     mode: legacy
>>>     # devlink dev set pci/0000:05:00.0 smode switchdev
>>>     # devlink dev smode pci/0000:05:00.0
>>>     mode: switchdev
>>>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>
>> I really don't see much value in this patch.  If you are going to
>> support SwitchDev then just do it.  Otherwise you are adding extra
>> overhead for maintaining two different modes.
>>
>> I would recommend putting this series out to netdev as an RFC.
>> Submitting it to intel-wired-lan is kind of pointless as the audience
>> it to small to get any valuable review.
>>
>> - Alex
>
> I argued at length about this already. Jiri and company wanted this flag
> to push device in and out of this mode. Here we are just following the
> already upstreamed and debated decision.

Yeah, I started doing more research after reviewing this patch.  I see
the idea behind it.  I think the issue if anything is that it seems
like things are a bit backwards.  We probably should be enabling the
SwitchDev bits first and then working on adding devlink knobs to
disable things later.

> This is less about switchdev and more about generating VF netdevs to
> use with ip tools and friends.

Right.  One of the issues I have with this patch set is that it seems
to get things backwards.  They are making VFs appear that don't do
much of anything and then trying to bolt on features after the fact.
We probably need to focus on enabling the VF representation, and then
providing the ability to switch them on and off.  Also I would argue
that we should actually be enabling switch features such as FDB
entries instead of trying to bolt on stuff like flow director which
doesn't really apply to very many switches and isn't as likely to be
used on a switch port.

> Another option would be to just always enable VF netdevs and have no
> legacy mode at all. I think that would be fine it just depends on if
> you think having extra netdevs around will confuse the stack at all.
> It might create a few corner cases but one reasonable thing to do
> would be to just fix those cases as they appear.

I'd say we are better off starting out with them just enabled and then
enabling the option to disable them after the fact.  If we are going
to have this extra code floating around we should be defaulting it to
enabled so that it is more likely to be used.  The legacy option
should only really be there so we can turn this off if we don't want
it.

- Alex
John Fastabend Aug. 10, 2016, 6:41 p.m. UTC | #5
On 16-08-10 11:18 AM, Alexander Duyck wrote:
> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> On 16-08-10 09:01 AM, Alexander Duyck wrote:
>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
>>> <sridhar.samudrala@intel.com> wrote:
>>>> Add initial devlink support to set/get the mode of SRIOV switch.
>>>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but
>>>> doesn't implement any functionality to create vf representors in switchdev
>>>> mode.
>>>>
>>>> With smode support in iproute2 'devlink' utility, switch mode can be set
>>>> and get via following commands.
>>>>
>>>>     # devlink dev smode pci/0000:05:00.0
>>>>     mode: legacy
>>>>     # devlink dev set pci/0000:05:00.0 smode switchdev
>>>>     # devlink dev smode pci/0000:05:00.0
>>>>     mode: switchdev
>>>>
>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>
>>> I really don't see much value in this patch.  If you are going to
>>> support SwitchDev then just do it.  Otherwise you are adding extra
>>> overhead for maintaining two different modes.
>>>
>>> I would recommend putting this series out to netdev as an RFC.
>>> Submitting it to intel-wired-lan is kind of pointless as the audience
>>> it to small to get any valuable review.
>>>
>>> - Alex
>>
>> I argued at length about this already. Jiri and company wanted this flag
>> to push device in and out of this mode. Here we are just following the
>> already upstreamed and debated decision.
> 
> Yeah, I started doing more research after reviewing this patch.  I see
> the idea behind it.  I think the issue if anything is that it seems
> like things are a bit backwards.  We probably should be enabling the
> SwitchDev bits first and then working on adding devlink knobs to
> disable things later.
> 

Sure, although its not clear to me exactly which switchdev bits are
useful for an edge NIC like this. Getting switch ids is one thing
that will become useful when we enable multiple bridges.

Otherwise I don't see what l2 switchdev blocks are useful vs
just using the standard ndo op interfaces already in place when
working on a device without learning/aging/etc. The one thing
that I've never bothered to add is pushing "learned" rules down
into the hardware but I'm not convinced for most use cases this
is particularly interesting because you _should_ know in a managed
system what MAC addresses a VM/container/etc is allowed to use
ahead of time via libvirt or other mgmt stack. I haven't tested
the VLAN handling though so that needs to be looked at.

And l3 switchdev routing may be interesting but its fairly
low on my priority list unless someone is really excited about it.

>> This is less about switchdev and more about generating VF netdevs to
>> use with ip tools and friends.
> 
> Right.  One of the issues I have with this patch set is that it seems
> to get things backwards.  They are making VFs appear that don't do
> much of anything and then trying to bolt on features after the fact.
> We probably need to focus on enabling the VF representation, and then
> providing the ability to switch them on and off.  Also I would argue
> that we should actually be enabling switch features such as FDB
> entries instead of trying to bolt on stuff like flow director which
> doesn't really apply to very many switches and isn't as likely to be
> used on a switch port.

Fair enough. Organizing the patches better seems OK to me. I plan to
use the 'tc' offloaded mechanisms not the ethtool flow director
interface for virtual switch offloads.

> 
>> Another option would be to just always enable VF netdevs and have no
>> legacy mode at all. I think that would be fine it just depends on if
>> you think having extra netdevs around will confuse the stack at all.
>> It might create a few corner cases but one reasonable thing to do
>> would be to just fix those cases as they appear.
> 
> I'd say we are better off starting out with them just enabled and then
> enabling the option to disable them after the fact.  If we are going
> to have this extra code floating around we should be defaulting it to
> enabled so that it is more likely to be used.  The legacy option
> should only really be there so we can turn this off if we don't want
> it.
> 

Works for me.


> - Alex
>
Kirsher, Jeffrey T Aug. 10, 2016, 8:50 p.m. UTC | #6
On Thu, 2016-08-04 at 18:49 +0200, Sridhar Samudrala wrote:
> 
> With smode support in iproute2 'devlink' utility, switch mode can be set
> and get via following commands.
> 
>     # devlink dev smode pci/0000:05:00.0
>     mode: legacy
>     # devlink dev set pci/0000:05:00.0 smode switchdev
>     # devlink dev smode pci/0000:05:00.0
>     mode: switchdev
> 
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>  drivers/net/ethernet/intel/Kconfig          |  1 +
>  drivers/net/ethernet/intel/i40e/i40e.h      |  3 ++
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 84
> ++++++++++++++++++++++++++---
>  3 files changed, 80 insertions(+), 8 deletions(-)

Since there was no title patch to this series, I am using patch 1 of the
series to address the whole series.

Since Alex has brought up some good concerns/comments that should be
addressed (and the fact that this series does not apply cleanly to my
current tree), I am dropping this series.  I will await a v2 where Alex's
comments are addressed and the series is rebased against my current dev-
queue branch.
Samudrala, Sridhar Aug. 16, 2016, 6:54 p.m. UTC | #7
On 8/10/2016 11:41 AM, John Fastabend wrote:
> On 16-08-10 11:18 AM, Alexander Duyck wrote:
>> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend
>> <john.fastabend@gmail.com> wrote:
>>> On 16-08-10 09:01 AM, Alexander Duyck wrote:
>>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
>>>> <sridhar.samudrala@intel.com> wrote:
>>>>> Add initial devlink support to set/get the mode of SRIOV switch.
>>>>> This patch allows the mode to be set to either 'legacy' or 'switchdev', but
>>>>> doesn't implement any functionality to create vf representors in switchdev
>>>>> mode.
>>>>>
>>>>> With smode support in iproute2 'devlink' utility, switch mode can be set
>>>>> and get via following commands.
>>>>>
>>>>>      # devlink dev smode pci/0000:05:00.0
>>>>>      mode: legacy
>>>>>      # devlink dev set pci/0000:05:00.0 smode switchdev
>>>>>      # devlink dev smode pci/0000:05:00.0
>>>>>      mode: switchdev
>>>>>
>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>> I really don't see much value in this patch.  If you are going to
>>>> support SwitchDev then just do it.  Otherwise you are adding extra
>>>> overhead for maintaining two different modes.
>>>>
>>>> I would recommend putting this series out to netdev as an RFC.
>>>> Submitting it to intel-wired-lan is kind of pointless as the audience
>>>> it to small to get any valuable review.

As these patches have only changes to i40e driver, i didn't include 
netdev.  Will do so when
i submit v2.

>>>>
>>>> - Alex
>>> I argued at length about this already. Jiri and company wanted this flag
>>> to push device in and out of this mode. Here we are just following the
>>> already upstreamed and debated decision.
>> Yeah, I started doing more research after reviewing this patch.  I see
>> the idea behind it.  I think the issue if anything is that it seems
>> like things are a bit backwards.  We probably should be enabling the
>> SwitchDev bits first and then working on adding devlink knobs to
>> disable things later.
>>
> Sure, although its not clear to me exactly which switchdev bits are
> useful for an edge NIC like this. Getting switch ids is one thing
> that will become useful when we enable multiple bridges.

This patchset is not really adding any switchdev ops.  We are calling 
the mode in which the
PF driver creates VF representors as SWITCHDEV mode (this is based on 
the netdev
discussion of mellanox patches).

Amritha has a patch to add switchdev ops to VF representor netdevs that 
enables returning
switch id via switchdev_port_attr_get() op.
>
> Otherwise I don't see what l2 switchdev blocks are useful vs
> just using the standard ndo op interfaces already in place when
> working on a device without learning/aging/etc. The one thing
> that I've never bothered to add is pushing "learned" rules down
> into the hardware but I'm not convinced for most use cases this
> is particularly interesting because you _should_ know in a managed
> system what MAC addresses a VM/container/etc is allowed to use
> ahead of time via libvirt or other mgmt stack. I haven't tested
> the VLAN handling though so that needs to be looked at.
>
> And l3 switchdev routing may be interesting but its fairly
> low on my priority list unless someone is really excited about it.
>
>>> This is less about switchdev and more about generating VF netdevs to
>>> use with ip tools and friends.
>> Right.  One of the issues I have with this patch set is that it seems
>> to get things backwards.  They are making VFs appear that don't do
>> much of anything and then trying to bolt on features after the fact.
>> We probably need to focus on enabling the VF representation, and then
>> providing the ability to switch them on and off.  Also I would argue
>> that we should actually be enabling switch features such as FDB
>> entries instead of trying to bolt on stuff like flow director which
>> doesn't really apply to very many switches and isn't as likely to be
>> used on a switch port.
> Fair enough. Organizing the patches better seems OK to me. I plan to
> use the 'tc' offloaded mechanisms not the ethtool flow director
> interface for virtual switch offloads.
>
>>> Another option would be to just always enable VF netdevs and have no
>>> legacy mode at all. I think that would be fine it just depends on if
>>> you think having extra netdevs around will confuse the stack at all.
>>> It might create a few corner cases but one reasonable thing to do
>>> would be to just fix those cases as they appear.
>> I'd say we are better off starting out with them just enabled and then
>> enabling the option to disable them after the fact.  If we are going
>> to have this extra code floating around we should be defaulting it to
>> enabled so that it is more likely to be used.  The legacy option
>> should only really be there so we can turn this off if we don't want
>> it.
>>
> Works for me.
>

OK.  If we all agree that 'creating VF netdevs' by default is the right 
way to go,  I will rearrange the
patchset in the following order.
    - Introduce VF representor netdevs (create VF netdevs by default 
when sr-iov VFs are enabled)
    - enable ethtool stats on VF representors
    - enable ntuple filters on VF representors
    - introduce devlink to enable the switch mode to be changed to 
'legacy' (no VF netdevs)

Thanks
Sridhar
Alexander Duyck Aug. 16, 2016, 8:39 p.m. UTC | #8
On Tue, Aug 16, 2016 at 11:54 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> On 8/10/2016 11:41 AM, John Fastabend wrote:
>>
>> On 16-08-10 11:18 AM, Alexander Duyck wrote:
>>>
>>> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend
>>> <john.fastabend@gmail.com> wrote:
>>>>
>>>> On 16-08-10 09:01 AM, Alexander Duyck wrote:
>>>>>
>>>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
>>>>> <sridhar.samudrala@intel.com> wrote:
>>>>>>
>>>>>> Add initial devlink support to set/get the mode of SRIOV switch.
>>>>>> This patch allows the mode to be set to either 'legacy' or
>>>>>> 'switchdev', but
>>>>>> doesn't implement any functionality to create vf representors in
>>>>>> switchdev
>>>>>> mode.
>>>>>>
>>>>>> With smode support in iproute2 'devlink' utility, switch mode can be
>>>>>> set
>>>>>> and get via following commands.
>>>>>>
>>>>>>      # devlink dev smode pci/0000:05:00.0
>>>>>>      mode: legacy
>>>>>>      # devlink dev set pci/0000:05:00.0 smode switchdev
>>>>>>      # devlink dev smode pci/0000:05:00.0
>>>>>>      mode: switchdev
>>>>>>
>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>>>
>>>>> I really don't see much value in this patch.  If you are going to
>>>>> support SwitchDev then just do it.  Otherwise you are adding extra
>>>>> overhead for maintaining two different modes.
>>>>>
>>>>> I would recommend putting this series out to netdev as an RFC.
>>>>> Submitting it to intel-wired-lan is kind of pointless as the audience
>>>>> it to small to get any valuable review.
>
>
> As these patches have only changes to i40e driver, i didn't include netdev.
> Will do so when
> i submit v2.
>
>>>>>
>>>>> - Alex
>>>>
>>>> I argued at length about this already. Jiri and company wanted this flag
>>>> to push device in and out of this mode. Here we are just following the
>>>> already upstreamed and debated decision.
>>>
>>> Yeah, I started doing more research after reviewing this patch.  I see
>>> the idea behind it.  I think the issue if anything is that it seems
>>> like things are a bit backwards.  We probably should be enabling the
>>> SwitchDev bits first and then working on adding devlink knobs to
>>> disable things later.
>>>
>> Sure, although its not clear to me exactly which switchdev bits are
>> useful for an edge NIC like this. Getting switch ids is one thing
>> that will become useful when we enable multiple bridges.
>
>
> This patchset is not really adding any switchdev ops.  We are calling the
> mode in which the
> PF driver creates VF representors as SWITCHDEV mode (this is based on the
> netdev
> discussion of mellanox patches).
>
> Amritha has a patch to add switchdev ops to VF representor netdevs that
> enables returning
> switch id via switchdev_port_attr_get() op.
>
>>
>> Otherwise I don't see what l2 switchdev blocks are useful vs
>> just using the standard ndo op interfaces already in place when
>> working on a device without learning/aging/etc. The one thing
>> that I've never bothered to add is pushing "learned" rules down
>> into the hardware but I'm not convinced for most use cases this
>> is particularly interesting because you _should_ know in a managed
>> system what MAC addresses a VM/container/etc is allowed to use
>> ahead of time via libvirt or other mgmt stack. I haven't tested
>> the VLAN handling though so that needs to be looked at.
>>
>> And l3 switchdev routing may be interesting but its fairly
>> low on my priority list unless someone is really excited about it.
>>
>>>> This is less about switchdev and more about generating VF netdevs to
>>>> use with ip tools and friends.
>>>
>>> Right.  One of the issues I have with this patch set is that it seems
>>> to get things backwards.  They are making VFs appear that don't do
>>> much of anything and then trying to bolt on features after the fact.
>>> We probably need to focus on enabling the VF representation, and then
>>> providing the ability to switch them on and off.  Also I would argue
>>> that we should actually be enabling switch features such as FDB
>>> entries instead of trying to bolt on stuff like flow director which
>>> doesn't really apply to very many switches and isn't as likely to be
>>> used on a switch port.
>>
>> Fair enough. Organizing the patches better seems OK to me. I plan to
>> use the 'tc' offloaded mechanisms not the ethtool flow director
>> interface for virtual switch offloads.
>>
>>>> Another option would be to just always enable VF netdevs and have no
>>>> legacy mode at all. I think that would be fine it just depends on if
>>>> you think having extra netdevs around will confuse the stack at all.
>>>> It might create a few corner cases but one reasonable thing to do
>>>> would be to just fix those cases as they appear.
>>>
>>> I'd say we are better off starting out with them just enabled and then
>>> enabling the option to disable them after the fact.  If we are going
>>> to have this extra code floating around we should be defaulting it to
>>> enabled so that it is more likely to be used.  The legacy option
>>> should only really be there so we can turn this off if we don't want
>>> it.
>>>
>> Works for me.
>>
>
> OK.  If we all agree that 'creating VF netdevs' by default is the right way
> to go,  I will rearrange the
> patchset in the following order.
>    - Introduce VF representor netdevs (create VF netdevs by default when
> sr-iov VFs are enabled)
>    - enable ethtool stats on VF representors
>    - enable ntuple filters on VF representors

No NTUPLE filters.  This is not the way the community will want to do
this, and this is not how I want to do this.

The easy way to think of this is that NTUPLE is just an alias for Rx
network flow classification.  Now where are the VF representors
supposed to receive traffic from?  It is the Tx path of the VF.  So by
putting NTUPLE filters on the VF representors you are implying  you
can get in the Tx path for the VF for all traffic, and last I knew
that was not the case.  As such it makes no sense to do it this way.

>    - introduce devlink to enable the switch mode to be changed to 'legacy'
> (no VF netdevs)
>
> Thanks
> Sridhar

You have to think of the VF representors as switch ports.  They don't
actually represent the VFs.  So if you think about it a switch port
will only have a few real features.  You can control the VLAN
membership, tagged/untagged, what FDB entries are assigned to the
port, the link speed, if the link is up/down, etc.  So some of your
first steps might be to focus on the L1 and L2 type items.  So for
example you might see if we can do things like control the upper limit
of the MTU for the VF by controlling the MTU for the VF representor.
See if you can at least make it so that the MAC and VLAN values for
the VF show up as FDB entries for the VF representor and so forth.

- Alex
Samudrala, Sridhar Aug. 16, 2016, 9:51 p.m. UTC | #9
On 8/16/2016 1:39 PM, Alexander Duyck wrote:
> On Tue, Aug 16, 2016 at 11:54 AM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>> On 8/10/2016 11:41 AM, John Fastabend wrote:
>>> On 16-08-10 11:18 AM, Alexander Duyck wrote:
>>>> On Wed, Aug 10, 2016 at 9:50 AM, John Fastabend
>>>> <john.fastabend@gmail.com> wrote:
>>>>> On 16-08-10 09:01 AM, Alexander Duyck wrote:
>>>>>> On Thu, Aug 4, 2016 at 9:49 AM, Sridhar Samudrala
>>>>>> <sridhar.samudrala@intel.com> wrote:
>>>>>>> Add initial devlink support to set/get the mode of SRIOV switch.
>>>>>>> This patch allows the mode to be set to either 'legacy' or
>>>>>>> 'switchdev', but
>>>>>>> doesn't implement any functionality to create vf representors in
>>>>>>> switchdev
>>>>>>> mode.
>>>>>>>
>>>>>>> With smode support in iproute2 'devlink' utility, switch mode can be
>>>>>>> set
>>>>>>> and get via following commands.
>>>>>>>
>>>>>>>       # devlink dev smode pci/0000:05:00.0
>>>>>>>       mode: legacy
>>>>>>>       # devlink dev set pci/0000:05:00.0 smode switchdev
>>>>>>>       # devlink dev smode pci/0000:05:00.0
>>>>>>>       mode: switchdev
>>>>>>>
>>>>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>>>> I really don't see much value in this patch.  If you are going to
>>>>>> support SwitchDev then just do it.  Otherwise you are adding extra
>>>>>> overhead for maintaining two different modes.
>>>>>>
>>>>>> I would recommend putting this series out to netdev as an RFC.
>>>>>> Submitting it to intel-wired-lan is kind of pointless as the audience
>>>>>> it to small to get any valuable review.
>>
>> As these patches have only changes to i40e driver, i didn't include netdev.
>> Will do so when
>> i submit v2.
>>
>>>>>> - Alex
>>>>> I argued at length about this already. Jiri and company wanted this flag
>>>>> to push device in and out of this mode. Here we are just following the
>>>>> already upstreamed and debated decision.
>>>> Yeah, I started doing more research after reviewing this patch.  I see
>>>> the idea behind it.  I think the issue if anything is that it seems
>>>> like things are a bit backwards.  We probably should be enabling the
>>>> SwitchDev bits first and then working on adding devlink knobs to
>>>> disable things later.
>>>>
>>> Sure, although its not clear to me exactly which switchdev bits are
>>> useful for an edge NIC like this. Getting switch ids is one thing
>>> that will become useful when we enable multiple bridges.
>>
>> This patchset is not really adding any switchdev ops.  We are calling the
>> mode in which the
>> PF driver creates VF representors as SWITCHDEV mode (this is based on the
>> netdev
>> discussion of mellanox patches).
>>
>> Amritha has a patch to add switchdev ops to VF representor netdevs that
>> enables returning
>> switch id via switchdev_port_attr_get() op.
>>
>>> Otherwise I don't see what l2 switchdev blocks are useful vs
>>> just using the standard ndo op interfaces already in place when
>>> working on a device without learning/aging/etc. The one thing
>>> that I've never bothered to add is pushing "learned" rules down
>>> into the hardware but I'm not convinced for most use cases this
>>> is particularly interesting because you _should_ know in a managed
>>> system what MAC addresses a VM/container/etc is allowed to use
>>> ahead of time via libvirt or other mgmt stack. I haven't tested
>>> the VLAN handling though so that needs to be looked at.
>>>
>>> And l3 switchdev routing may be interesting but its fairly
>>> low on my priority list unless someone is really excited about it.
>>>
>>>>> This is less about switchdev and more about generating VF netdevs to
>>>>> use with ip tools and friends.
>>>> Right.  One of the issues I have with this patch set is that it seems
>>>> to get things backwards.  They are making VFs appear that don't do
>>>> much of anything and then trying to bolt on features after the fact.
>>>> We probably need to focus on enabling the VF representation, and then
>>>> providing the ability to switch them on and off.  Also I would argue
>>>> that we should actually be enabling switch features such as FDB
>>>> entries instead of trying to bolt on stuff like flow director which
>>>> doesn't really apply to very many switches and isn't as likely to be
>>>> used on a switch port.
>>> Fair enough. Organizing the patches better seems OK to me. I plan to
>>> use the 'tc' offloaded mechanisms not the ethtool flow director
>>> interface for virtual switch offloads.
>>>
>>>>> Another option would be to just always enable VF netdevs and have no
>>>>> legacy mode at all. I think that would be fine it just depends on if
>>>>> you think having extra netdevs around will confuse the stack at all.
>>>>> It might create a few corner cases but one reasonable thing to do
>>>>> would be to just fix those cases as they appear.
>>>> I'd say we are better off starting out with them just enabled and then
>>>> enabling the option to disable them after the fact.  If we are going
>>>> to have this extra code floating around we should be defaulting it to
>>>> enabled so that it is more likely to be used.  The legacy option
>>>> should only really be there so we can turn this off if we don't want
>>>> it.
>>>>
>>> Works for me.
>>>
>> OK.  If we all agree that 'creating VF netdevs' by default is the right way
>> to go,  I will rearrange the
>> patchset in the following order.
>>     - Introduce VF representor netdevs (create VF netdevs by default when
>> sr-iov VFs are enabled)
>>     - enable ethtool stats on VF representors
>>     - enable ntuple filters on VF representors
> No NTUPLE filters.  This is not the way the community will want to do
> this, and this is not how I want to do this.
>
> The easy way to think of this is that NTUPLE is just an alias for Rx
> network flow classification.  Now where are the VF representors
> supposed to receive traffic from?  It is the Tx path of the VF.  So by
> putting NTUPLE filters on the VF representors you are implying  you
> can get in the Tx path for the VF for all traffic, and last I knew
> that was not the case.  As such it makes no sense to do it this way.

The idea behind NTUPLE filters on VF representors is to match RX traffic that is destined
for the corresponding VF. Basically when adding the flow director filter, we are setting the
dest vsi as the VSI of the corresponding VF.
Without VF representors, this is currently done by overloading user-def value with VF id.


>
>>     - introduce devlink to enable the switch mode to be changed to 'legacy'
>> (no VF netdevs)
>>
>> Thanks
>> Sridhar
> You have to think of the VF representors as switch ports.  They don't
> actually represent the VFs.  So if you think about it a switch port
> will only have a few real features.  You can control the VLAN
> membership, tagged/untagged, what FDB entries are assigned to the
> port, the link speed, if the link is up/down, etc.  So some of your
> first steps might be to focus on the L1 and L2 type items.  So for
> example you might see if we can do things like control the upper limit
> of the MTU for the VF by controlling the MTU for the VF representor.
> See if you can at least make it so that the MAC and VLAN values for
> the VF show up as FDB entries for the VF representor and so forth.

Sure.  These are additional items that we can look into that can be 
enabled via VF representors.
>
> - Alex
Alexander Duyck Aug. 17, 2016, 12:17 a.m. UTC | #10
On Tue, Aug 16, 2016 at 2:51 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> The idea behind NTUPLE filters on VF representors is to match RX traffic
> that is destined
> for the corresponding VF. Basically when adding the flow director filter, we
> are setting the
> dest vsi as the VSI of the corresponding VF.
> Without VF representors, this is currently done by overloading user-def
> value with VF id.
>

Yes, and for now that is preferable to what you are doing since it
makes no sense.

I will try to clarify my point.  First, instead of calling these VF
representors lets call them VF port representor, or VFPR for short
(trying to avoid VFR because it sounds too much like VRF which is
completely unrelated).  So the whole point of a VFPR is to present the
VF as a port on your logical switch.  You should mostly only be
capable of doing on it what you would do with an actual switch.  So
for example if I have a switch connected to NIC.  Now most switches
have certain options for controlling things.  For example I can force
the switch port onto a given VLAN so that the port is isolated from
other traffic.  I can control the max frame size allowed to be sent or
received over the port.  I can block what traffic the port is allowed
to send by restricting MAC addresses via ACLs, and I can manually
program the forwarding table via FDB entries.

The problem is Flow Director also know as NTUPLE or Rx NFC only
operates on the Rx side of NICs.  In your VFPR the Rx side represents
packets coming from your VF, not packets going to it.   If you
transmit on a VFPR the packet should be received only by the VF, if
you receive from it the packet should only be from the VF.  So really
any Flow Director rule you add to the VFPR should not impact the VF,
but instead should impact the VFPR and the PF since the PF will add
the rule and it should only apply on traffic received on the VFPR from
the VF.

At some point in the future we can look at adding flow Tx offloads via
either TC or SwitchDev.  For now adding NTUPLE is a strong no-go.  The
VFPR are not the place to do it unless you are just wanting to add
custom rules to the PF itself that won't impact the VF.

- Alex
John Fastabend Aug. 17, 2016, 7:24 p.m. UTC | #11
On 16-08-16 05:17 PM, Alexander Duyck wrote:
> On Tue, Aug 16, 2016 at 2:51 PM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>> The idea behind NTUPLE filters on VF representors is to match RX traffic
>> that is destined
>> for the corresponding VF. Basically when adding the flow director filter, we
>> are setting the
>> dest vsi as the VSI of the corresponding VF.
>> Without VF representors, this is currently done by overloading user-def
>> value with VF id.
>>
> 
> Yes, and for now that is preferable to what you are doing since it
> makes no sense.
> 
> I will try to clarify my point.  First, instead of calling these VF
> representors lets call them VF port representor, or VFPR for short
> (trying to avoid VFR because it sounds too much like VRF which is
> completely unrelated).  So the whole point of a VFPR is to present the
> VF as a port on your logical switch.  You should mostly only be
> capable of doing on it what you would do with an actual switch.  So
> for example if I have a switch connected to NIC.  Now most switches
> have certain options for controlling things.  For example I can force
> the switch port onto a given VLAN so that the port is isolated from
> other traffic.  I can control the max frame size allowed to be sent or
> received over the port.  I can block what traffic the port is allowed
> to send by restricting MAC addresses via ACLs, and I can manually
> program the forwarding table via FDB entries.
> 
> The problem is Flow Director also know as NTUPLE or Rx NFC only
> operates on the Rx side of NICs.  In your VFPR the Rx side represents
> packets coming from your VF, not packets going to it.   If you
> transmit on a VFPR the packet should be received only by the VF, if
> you receive from it the packet should only be from the VF.  So really
> any Flow Director rule you add to the VFPR should not impact the VF,
> but instead should impact the VFPR and the PF since the PF will add
> the rule and it should only apply on traffic received on the VFPR from
> the VF.
> 
> At some point in the future we can look at adding flow Tx offloads via
> either TC or SwitchDev.  For now adding NTUPLE is a strong no-go.  The
> VFPR are not the place to do it unless you are just wanting to add
> custom rules to the PF itself that won't impact the VF.
> 
> - Alex
> 

FWIW the features I want from the VFPR (nice acronym :) are the
following,

  - xmit (just send it over the PF queues is fine)
  - recv : ideally the driver does this with descriptor magic but
           the next best thing would be to do this in the driver
	   with a lookup on mac when anti-spoofing is on. Or worst
           case if the driver logic is too ugly I'll hack it in
           userspace :(
  - up/down link state
  - statistics
  - not essential but mtu would also be nice

I think thats it to get my basic stuff working in userspace. Another
nice piece would be the fdb_add offloads so we can get a basic l2 bridge
working and bonus points for including the vlan parts of the fdb_add
msg.

Thanks,
John
diff mbox

Patch

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index c0e1743..2ede229 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -215,6 +215,7 @@  config I40E
 	tristate "Intel(R) Ethernet Controller XL710 Family support"
 	select PTP_1588_CLOCK
 	depends on PCI
+	depends on MAY_USE_DEVLINK
 	---help---
 	  This driver supports Intel(R) Ethernet Controller XL710 Family of
 	  devices.  For more information on how to identify your adapter, go
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 2a88291..724bd82 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -53,6 +53,8 @@ 
 #include <linux/clocksource.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
+#include <net/devlink.h>
+
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #ifdef I40E_FCOE
@@ -446,6 +448,7 @@  struct i40e_pf {
 	u32 ioremap_len;
 	u32 fd_inv;
 	u16 phy_led_val;
+	enum devlink_eswitch_mode eswitch_mode;
 };
 
 enum i40e_filter_state {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 339d99b..bb44f09 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -27,6 +27,7 @@ 
 #include <linux/etherdevice.h>
 #include <linux/of_net.h>
 #include <linux/pci.h>
+#include <net/devlink.h>
 
 /* Local includes */
 #include "i40e.h"
@@ -10671,6 +10672,58 @@  static void i40e_get_platform_mac_addr(struct pci_dev *pdev, struct i40e_pf *pf)
 }
 
 /**
+ * i40e_devlink_eswitch_mode_get
+ *
+ * @devlink: pointer to devlink struct
+ * @mode: sr-iov switch mode pointer
+ *
+ * Returns the switch mode of the associated PF in the @mode pointer.
+ */
+static int i40e_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
+{
+	struct i40e_pf *pf = devlink_priv(devlink);
+
+	*mode = pf->eswitch_mode;
+
+	return 0;
+}
+
+/**
+ * i40e_devlink_eswitch_mode_set
+ *
+ * @devlink: pointer to devlink struct
+ * @mode: sr-iov switch mode
+ *
+ * Set the switch mode of the associated PF.
+ * Returns 0 on success and -EOPNOTSUPP on error.
+ */
+static int i40e_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+{
+	struct i40e_pf *pf = devlink_priv(devlink);
+	int err = 0;
+
+	if (mode == pf->eswitch_mode)
+		goto done;
+
+	switch (mode) {
+	case DEVLINK_ESWITCH_MODE_LEGACY:
+	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
+		pf->eswitch_mode = mode;
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+
+done:
+	return err;
+}
+
+static const struct devlink_ops i40e_devlink_ops = {
+	.eswitch_mode_get = i40e_devlink_eswitch_mode_get,
+	.eswitch_mode_set = i40e_devlink_eswitch_mode_set,
+};
+
+/**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in i40e_pci_tbl
@@ -10687,6 +10740,7 @@  static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	struct i40e_pf *pf;
 	struct i40e_hw *hw;
 	static u16 pfs_found;
+	struct devlink *devlink;
 	u16 wol_nvm_bits;
 	u16 link_status;
 	int err;
@@ -10721,16 +10775,21 @@  static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	pci_enable_pcie_error_reporting(pdev);
 	pci_set_master(pdev);
 
+	devlink = devlink_alloc(&i40e_devlink_ops, sizeof(*pf));
+	if (!devlink) {
+		dev_err(&pdev->dev,
+			"devlink_alloc failed\n");
+		err = -ENOMEM;
+		goto err_devlink_alloc;
+	}
+
 	/* Now that we have a PCI connection, we need to do the
 	 * low level device setup.  This is primarily setting up
 	 * the Admin Queue structures and then querying for the
 	 * device's current profile information.
 	 */
-	pf = kzalloc(sizeof(*pf), GFP_KERNEL);
-	if (!pf) {
-		err = -ENOMEM;
-		goto err_pf_alloc;
-	}
+	pf = devlink_priv(devlink);
+
 	pf->next_vsi = 0;
 	pf->pdev = pdev;
 	set_bit(__I40E_DOWN, &pf->state);
@@ -11047,6 +11106,11 @@  static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		}
 	}
 
+	pf->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
+	err = devlink_register(devlink, &pdev->dev);
+	if (err)
+		goto err_devlink_register;
+
 #ifdef CONFIG_PCI_IOV
 	/* prep for VF support */
 	if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) &&
@@ -11188,6 +11252,8 @@  static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	return 0;
 
 	/* Unwind what we've done if something failed in the setup */
+err_devlink_register:
+	i40e_stop_misc_vector(pf);
 err_vsis:
 	set_bit(__I40E_DOWN, &pf->state);
 	i40e_clear_interrupt_scheme(pf);
@@ -11205,8 +11271,8 @@  err_adminq_setup:
 err_pf_reset:
 	iounmap(hw->hw_addr);
 err_ioremap:
-	kfree(pf);
-err_pf_alloc:
+	devlink_free(devlink);
+err_devlink_alloc:
 	pci_disable_pcie_error_reporting(pdev);
 	pci_release_selected_regions(pdev,
 				     pci_select_bars(pdev, IORESOURCE_MEM));
@@ -11229,9 +11295,11 @@  static void i40e_remove(struct pci_dev *pdev)
 {
 	struct i40e_pf *pf = pci_get_drvdata(pdev);
 	struct i40e_hw *hw = &pf->hw;
+	struct devlink *devlink = priv_to_devlink(pf);
 	i40e_status ret_code;
 	int i;
 
+	devlink_unregister(devlink);
 	i40e_dbg_pf_exit(pf);
 
 	i40e_ptp_stop(pf);
@@ -11319,7 +11387,7 @@  static void i40e_remove(struct pci_dev *pdev)
 	kfree(pf->vsi);
 
 	iounmap(hw->hw_addr);
-	kfree(pf);
+	devlink_free(devlink);
 	pci_release_selected_regions(pdev,
 				     pci_select_bars(pdev, IORESOURCE_MEM));