diff options
author | David S. Miller <davem@davemloft.net> | 2015-07-15 21:39:40 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-07-15 21:39:40 -0700 |
commit | 0d0578815bac634aef6e64794e7ad2473fc7af44 (patch) | |
tree | 7f28ceeeac184512de8789d4cb1898ff0bb33240 | |
parent | 07e6a97da1eba064bb35cfd5c121e90865393a60 (diff) | |
parent | c305524617dcd617d698dfe2682f3212e698f781 (diff) | |
download | op-kernel-dev-0d0578815bac634aef6e64794e7ad2473fc7af44.zip op-kernel-dev-0d0578815bac634aef6e64794e7ad2473fc7af44.tar.gz |
Merge branch 'protodown'
Anuradha Karuppiah says:
====================
net: Introduce protodown flag.
User space daemons can detect errors in the network that need to be
notified to the switch device drivers.
Drivers can react to this error state by doing a phy-down on the
switch-port which would result in a carrier-off locally and on the directly
connected switch. Doing that would prevent loops and black-holes in the
network.
One such use case is the multi-chassis LAG application -
1. The MLAG application runs on peer switches (say Switch0 and Switch1)
synchronizing states, forwarding entries etc. between the two
switches over the peer-link (this is a link directly connecting the
two switches).
2. An MLAG election process designates one of the switches as a primary
(for e.g. Switch0 is primary and Switch1 is secondary).
3. The peer link plays a critical role in allowing Switch0-Switch1 to
function as a single LAG partner to the downstream dual-connected
servers. When the peer-link between the switches goes down we have a
split-brain situation. Switch0 and Switch1 are no longer in sync and
are acting independently. This can result in traffic loops and
traffic black-holing in the network.
4. To prevent these problems the MLAG application on the secondary
switch phy-downs the MLAG ports on detecting the peer-link down.
This will be seen as a carrier down on servers that are
dual-connected to Switch0 and Switch1.
5. Specifically a dual-connected server will see a carrier-down on the
port connected to the MLAG secondary, Switch1, and will stop using
that port for traffic TX. So traffic black holing is prevented.
v6 to v7:
Removed some unnecessary code in response to review comments.
v5 to v6:
Replaced proto_flags with a simple proto_down boolean attribute in
response to Dave's comments.
v4 to v5:
Changed the ip link display format for protodown to match the set as
recommended by Stephen.
v3 to v4:
I have moved protodown out of IFF_XXX and introduced a separate
proto_flags field with IF_PROTOF_DOWN bit being used by apps to notify
switch port errors. This is in response to Stephen's comments that
adding a new IFF_XXX may break user space.
I have used rocker as the sample switch driver. And to test this
functionality I used the qemu-rocker patch that Scott sent out in
response to the v3 posting (needed to set link up/down when phy is
enabled/disabled).
v1 to v2:
Based on Dave's suggestion I have moved out aggregating of error bits
across applications to a user space framework. This patch now simply
notifies an aggregated error bit to drivers enabling them to handle
the error gracefully.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/ethernet/rocker/rocker.c | 15 | ||||
-rw-r--r-- | include/linux/netdevice.h | 14 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h | 1 | ||||
-rw-r--r-- | net/core/dev.c | 20 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 14 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 16 |
6 files changed, 77 insertions, 3 deletions
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index c005167..9324283 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4015,7 +4015,8 @@ static int rocker_port_open(struct net_device *dev) napi_enable(&rocker_port->napi_tx); napi_enable(&rocker_port->napi_rx); - rocker_port_set_enable(rocker_port, true); + if (!dev->proto_down) + rocker_port_set_enable(rocker_port, true); netif_start_queue(dev); return 0; @@ -4227,6 +4228,17 @@ static int rocker_port_get_phys_port_name(struct net_device *dev, return err ? -EOPNOTSUPP : 0; } +static int rocker_port_change_proto_down(struct net_device *dev, + bool proto_down) +{ + struct rocker_port *rocker_port = netdev_priv(dev); + + if (rocker_port->dev->flags & IFF_UP) + rocker_port_set_enable(rocker_port, !proto_down); + rocker_port->dev->proto_down = proto_down; + return 0; +} + static const struct net_device_ops rocker_port_netdev_ops = { .ndo_open = rocker_port_open, .ndo_stop = rocker_port_stop, @@ -4240,6 +4252,7 @@ static const struct net_device_ops rocker_port_netdev_ops = { .ndo_fdb_del = switchdev_port_fdb_del, .ndo_fdb_dump = switchdev_port_fdb_dump, .ndo_get_phys_port_name = rocker_port_get_phys_port_name, + .ndo_change_proto_down = rocker_port_change_proto_down, }; /******************** diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e20979d..45cfd79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1041,6 +1041,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. + * void (*ndo_change_proto_down)(struct net_device *dev, + * bool proto_down); + * This function is used to pass protocol port error state information + * to the switch driver. The switch driver can react to the proto_down + * by doing a phys down on the associated switch port. + * */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1211,6 +1217,8 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); + int (*ndo_change_proto_down)(struct net_device *dev, + bool proto_down); }; /** @@ -1502,6 +1510,10 @@ enum netdev_priv_flags { * * @qdisc_tx_busylock: XXX: need comments on this one * + * @proto_down: protocol port state information can be sent to the + * switch driver and used to set the phys state of the + * switch port. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -1762,6 +1774,7 @@ struct net_device { #endif struct phy_device *phydev; struct lock_class_key *qdisc_tx_busylock; + bool proto_down; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2982,6 +2995,7 @@ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); +int dev_change_proto_down(struct net_device *dev, bool proto_down); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2c7e8e3..24d68b7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -148,6 +148,7 @@ enum { IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 69445a3..8810b6b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6075,6 +6075,26 @@ int dev_get_phys_port_name(struct net_device *dev, EXPORT_SYMBOL(dev_get_phys_port_name); /** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 18b34d7..194c1d0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool) proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9e433d5..03d61b5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -896,7 +896,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(1); /* IFLA_PROTO_DOWN */ + } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1082,7 +1084,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, (dev->ifalias && nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes))) + atomic_read(&dev->carrier_changes)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; if (1) { @@ -1319,6 +1322,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1858,6 +1862,14 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; + if (tb[IFLA_PROTO_DOWN]) { + err = dev_change_proto_down(dev, + nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) |