From cdd38b219eec2e1b83c0a02d89d372f9656648eb Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 18 Sep 2015 11:30:40 +0200 Subject: mac802154: llsec: fix device deletion from list This patch adds a missing list_del when a device description will be deleted. Cc: Phoebe Buckheister Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/llsec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 985e939..7799d3c 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -401,6 +401,7 @@ int mac802154_llsec_dev_del(struct mac802154_llsec *sec, __le64 device_addr) hash_del_rcu(&pos->bucket_s); hash_del_rcu(&pos->bucket_hw); + list_del_rcu(&pos->dev.list); call_rcu(&pos->rcu, llsec_dev_free_rcu); return 0; -- cgit v1.1 From a1da67b8117ddbe88c770b48b5b1527393b8c9c0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 18 Sep 2015 11:30:41 +0200 Subject: ieee802154: header_ops: fix frame control setting Sometimes upper-layer protocols wants to generate a new mac header by filling "struct ieee802154_hdr" only. These upper-layers sets for the address settings the source and dest fields, but not the fc fields for indicate the source and dest address mode. This patch changes the "ieee802154_hdr_push" function so the fc address fields are set according the source and dest fields of "struct ieee802154_hdr". 
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/header_ops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c index a051b69..d8443b0 100644 --- a/net/ieee802154/header_ops.c +++ b/net/ieee802154/header_ops.c @@ -83,35 +83,35 @@ ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) } int -ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) +ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr) { u8 buf[MAC802154_FRAME_HARD_HEADER_LEN]; int pos = 2; int rc; - struct ieee802154_hdr_fc fc = hdr->fc; + struct ieee802154_hdr_fc *fc = &hdr->fc; buf[pos++] = hdr->seq; - fc.dest_addr_mode = hdr->dest.mode; + fc->dest_addr_mode = hdr->dest.mode; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false); if (rc < 0) return -EINVAL; pos += rc; - fc.source_addr_mode = hdr->source.mode; + fc->source_addr_mode = hdr->source.mode; if (hdr->source.pan_id == hdr->dest.pan_id && hdr->dest.mode != IEEE802154_ADDR_NONE) - fc.intra_pan = true; + fc->intra_pan = true; - rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc.intra_pan); + rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan); if (rc < 0) return -EINVAL; pos += rc; - if (fc.security_enabled) { - fc.version = 1; + if (fc->security_enabled) { + fc->version = 1; rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec); if (rc < 0) @@ -120,7 +120,7 @@ ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr) pos += rc; } - memcpy(buf, &fc, 2); + memcpy(buf, fc, 2); memcpy(skb_push(skb, pos), buf, pos); -- cgit v1.1 From 838b83d63d2909f9136f3030dc4fffa8230c31da Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 18 Sep 2015 11:30:42 +0200 Subject: ieee802154: introduce wpan_dev_header_ops The current header_ops callback structure of net device are used mostly from 802.15.4 upper-layers. 
Because this callback structure is a very generic one, which is also used by e.g. DGRAM AF_PACKET sockets, we can't make this callback structure 802.15.4 specific, which it currently is. The smallest "constraint" I saw for calling this callback via dev_hard_header/dev_parse_header comes from AF_PACKET, which assigns an 8 byte array for the address void pointers. Currently 802.15.4 specific protocols like af802154 and 6LoWPAN pass a "struct ieee802154_addr" as these parameters, which is greater than 8 bytes. The current callback implementation for header_ops.create always assumes a complete "struct ieee802154_addr", which AF_PACKET can never handle because it is greater than 8 bytes. For that reason we now introduce a "generic" create/parse header_ops callback which handles intra-pan extended addresses only. This allows a small use-case with AF_PACKET to send "somehow" a valid dataframe over DGRAM. To keep the current dev_hard_header behaviour we introduce a similar callback structure "wpan_dev_header_ops" which contains 802.15.4 specific upper-layer header creation functionality, which can be called by wpan_dev_hard_header. 
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/tx.c | 8 ++-- net/ieee802154/socket.c | 4 +- net/mac802154/iface.c | 91 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 85 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 54939d0..6067e06 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -87,8 +87,8 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, skb_reset_network_header(frag); *mac_cb(frag) = *mac_cb(skb); - rc = dev_hard_header(frag, wdev, 0, &master_hdr->dest, - &master_hdr->source, size); + rc = wpan_dev_hard_header(frag, wdev, &master_hdr->dest, + &master_hdr->source, size); if (rc < 0) { kfree_skb(frag); return ERR_PTR(rc); @@ -228,8 +228,8 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, cb->ackreq = wpan_dev->ackreq; } - return dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, ETH_P_IPV6, - (void *)&da, (void *)&sa, 0); + return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa, + 0); } netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index b6eacf3..be77f21 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -676,8 +676,8 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; - err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &dst_addr, - ro->bound ? &ro->src_addr : NULL, size); + err = wpan_dev_hard_header(skb, dev, &dst_addr, + ro->bound ? 
&ro->src_addr : NULL, size); if (err < 0) goto out_skb; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index ed26952..8afe26d 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -367,12 +367,11 @@ static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, return 0; } -static int mac802154_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, - const void *daddr, - const void *saddr, - unsigned len) +static int ieee802154_header_create(struct sk_buff *skb, + struct net_device *dev, + const struct ieee802154_addr *daddr, + const struct ieee802154_addr *saddr, + unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); @@ -423,24 +422,91 @@ static int mac802154_header_create(struct sk_buff *skb, return hlen; } +static const struct wpan_dev_header_ops ieee802154_header_ops = { + .create = ieee802154_header_create, +}; + +/* This header create functionality assumes a 8 byte array for + * source and destination pointer at maximum. To adapt this for + * the 802.15.4 dataframe header we use extended address handling + * here only and intra pan connection. fc fields are mostly fallback + * handling. For provide dev_hard_header for dgram sockets. + */ +static int mac802154_header_create(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, + const void *saddr, + unsigned len) +{ + struct ieee802154_hdr hdr; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct ieee802154_mac_cb cb = { }; + int hlen; + + if (!daddr) + return -EINVAL; + + memset(&hdr.fc, 0, sizeof(hdr.fc)); + hdr.fc.type = IEEE802154_FC_TYPE_DATA; + hdr.fc.ack_request = wpan_dev->ackreq; + hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; + + /* TODO currently a workaround to give zero cb block to set + * security parameters defaults according MIB. 
+ */ + if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) + return -EINVAL; + + hdr.dest.pan_id = wpan_dev->pan_id; + hdr.dest.mode = IEEE802154_ADDR_LONG; + memcpy(&hdr.dest.extended_addr, daddr, IEEE802154_EXTENDED_ADDR_LEN); + + hdr.source.pan_id = hdr.dest.pan_id; + hdr.source.mode = IEEE802154_ADDR_LONG; + + if (!saddr) + hdr.source.extended_addr = wpan_dev->extended_addr; + else + memcpy(&hdr.source.extended_addr, saddr, + IEEE802154_EXTENDED_ADDR_LEN); + + hlen = ieee802154_hdr_push(skb, &hdr); + if (hlen < 0) + return -EINVAL; + + skb_reset_mac_header(skb); + skb->mac_len = hlen; + + if (len > ieee802154_max_payload(&hdr)) + return -EMSGSIZE; + + return hlen; +} + static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; - struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } - *addr = hdr.source; - return sizeof(*addr); + if (hdr.source.mode == IEEE802154_ADDR_LONG) { + memcpy(haddr, &hdr.source.extended_addr, + IEEE802154_EXTENDED_ADDR_LEN); + return IEEE802154_EXTENDED_ADDR_LEN; + } + + return 0; } -static struct header_ops mac802154_header_ops = { - .create = mac802154_header_create, - .parse = mac802154_header_parse, +static const struct header_ops mac802154_header_ops = { + .create = mac802154_header_create, + .parse = mac802154_header_parse, }; static const struct net_device_ops mac802154_wpan_ops = { @@ -513,6 +579,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; wpan_dev->promiscuous_mode = false; + wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); -- cgit v1.1 From 87a93e4eceb495f93e3f37b100334d2641765b6c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 18 Sep 2015 11:30:43 +0200 Subject: ieee802154: change needed headroom/tailroom This patch 
cleanups needed_headroom, needed_tailroom and hard_header_len fields for wpan and lowpan interfaces. For wpan interfaces the worst case mac header len should be part of needed_headroom, currently this is set as hard_header_len, but hard_header_len should be set to the minimum header length which xmit call assumes and this is the minimum frame length of 802.15.4. The hard_header_len value will check inside send callbacl of AF_PACKET raw sockets. For lowpan interfaces, if fragmentation isn't needed the skb will call dev_hard_header for 802154 layer and queue it afterwards. This happens without new skb allocation, so we need the same headroom and tailroom lengths like 802154 inside 802154 6lowpan layer. At least we assume as minimum header length an ipv6 header size. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/6lowpan/nhc.h | 2 -- net/ieee802154/6lowpan/core.c | 14 +++++++++++--- net/ieee802154/6lowpan/tx.c | 12 ++++++++++-- net/ieee802154/header_ops.c | 2 +- net/mac802154/iface.c | 17 ++++++++++++++--- net/mac802154/tx.c | 3 --- 6 files changed, 36 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/6lowpan/nhc.h b/net/6lowpan/nhc.h index ed44938..c249f17 100644 --- a/net/6lowpan/nhc.h +++ b/net/6lowpan/nhc.h @@ -8,8 +8,6 @@ #include #include -#define LOWPAN_NHC_MAX_ID_LEN 1 - /** * LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct * diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 9f0cfa5..44420ed 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -104,9 +104,8 @@ static void lowpan_setup(struct net_device *ldev) ldev->addr_len = IEEE802154_ADDR_LEN; memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); ldev->type = ARPHRD_6LOWPAN; - /* Frame Control + Sequence Number + Address fields + Security Header */ - ldev->hard_header_len = 2 + 1 + 20 + 14; - ldev->needed_tailroom = 2; /* FCS */ + /* We need an ipv6hdr as minimum len when calling xmit 
*/ + ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->mtu = IPV6_MIN_MTU; ldev->priv_flags |= IFF_NO_QUEUE; ldev->flags = IFF_BROADCAST | IFF_MULTICAST; @@ -156,6 +155,15 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, lowpan_dev_info(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); + /* We need headroom for possible wpan_dev_hard_header call and tailroom + * for encryption/fcs handling. The lowpan interface will replace + * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN + * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 + * header. + */ + ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + + wdev->needed_headroom; + ldev->needed_tailroom = wdev->needed_tailroom; lowpan_netdev_setup(ldev, LOWPAN_LLTYPE_IEEE802154); diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 6067e06..7e0563e 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -10,6 +10,7 @@ #include #include +#include #include "6lowpan_i.h" @@ -36,6 +37,13 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) sizeof(struct lowpan_addr_info)); } +/* This callback will be called from AF_PACKET and IPv6 stack, the AF_PACKET + * sockets gives an 8 byte array for addresses only! + * + * TODO I think AF_PACKET DGRAM (sending/receiving) RAW (sending) makes no + * sense here. We should disable it, the right use-case would be AF_INET6 + * RAW/DGRAM sockets. 
+ */ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) @@ -77,13 +85,13 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, struct sk_buff *frag; int rc; - frag = alloc_skb(wdev->hard_header_len + wdev->needed_tailroom + size, + frag = alloc_skb(wdev->needed_headroom + wdev->needed_tailroom + size, GFP_ATOMIC); if (likely(frag)) { frag->dev = wdev; frag->priority = skb->priority; - skb_reserve(frag, wdev->hard_header_len); + skb_reserve(frag, wdev->needed_headroom); skb_reset_network_header(frag); *mac_cb(frag) = *mac_cb(skb); diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c index d8443b0..c7439f0 100644 --- a/net/ieee802154/header_ops.c +++ b/net/ieee802154/header_ops.c @@ -85,7 +85,7 @@ ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr) { - u8 buf[MAC802154_FRAME_HARD_HEADER_LEN]; + u8 buf[IEEE802154_MAX_HEADER_LEN]; int pos = 2; int rc; struct ieee802154_hdr_fc *fc = &hdr->fc; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 8afe26d..b5a0936 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -537,8 +537,18 @@ static void ieee802154_if_setup(struct net_device *dev) dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); - dev->hard_header_len = MAC802154_FRAME_HARD_HEADER_LEN; - dev->needed_tailroom = 2 + 16; /* FCS + MIC */ + /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET + * will not send frames without any payload, but ack frames + * has no payload, so substract one that we can send a 3 bytes + * frame. The xmit callback assumes at least a hard header where two + * bytes fc and sequence field are set. 
+ */ + dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; + /* The auth_tag header is for security and places in private payload + * room of mac frame which stucks between payload and FCS field. + */ + dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + + IEEE802154_FCS_LEN; dev->mtu = IEEE802154_MTU; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; @@ -617,7 +627,8 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, if (!ndev) return ERR_PTR(-ENOMEM); - ndev->needed_headroom = local->hw.extra_tx_headroom; + ndev->needed_headroom = local->hw.extra_tx_headroom + + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 7ed4391..66d7ecb 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -77,9 +77,6 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) put_unaligned_le16(crc, skb_put(skb, 2)); } - if (skb_cow_head(skb, local->hw.extra_tx_headroom)) - goto err_tx; - /* Stop the netif queue on each sub_if_data object. */ ieee802154_stop_queue(&local->hw); -- cgit v1.1 From 02c7b6922899621aa8e8babe27fca7b6b2e497b0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 18 Sep 2015 11:30:44 +0200 Subject: mac802154: tx: add warning if MTU exceeds Sending over AF_PACKET RAW sockets we can sending frames which exceeds MTU size. To handling it correct we need to change things in AF_PACKET which knows on RAW sockets an additional FCS is set by hardware or mac802154 transmit functionality. 
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/tx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 66d7ecb..5ee596e 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -71,6 +71,17 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) struct net_device *dev = skb->dev; int ret; + /* This check is for AF_PACKET RAW socket only, which doesn't + * know about the FCS which is set here or by hardware. otherwise + * it should not occur in any case! + * + * TODO: This should be handled in AF_PACKET and return -EMSGSIZE. + */ + if (skb->len > IEEE802154_MTU - IEEE802154_FCS_LEN) { + netdev_warn(dev, "Frame len above MTU limit. Dropped.\n"); + goto err_tx; + } + if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) { u16 crc = crc_ccitt(0, skb->data, skb->len); -- cgit v1.1 From 8de1c63ba1ccfa8225505e60b405537c2c72673c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Aug 2015 14:13:06 +0200 Subject: wireless: make __freq_reg_info static As pointed out by sparse, this symbol should be static, make it so. 
Signed-off-by: Johannes Berg --- net/wireless/reg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b144485..70aef72 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1040,8 +1040,8 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, return ERR_PTR(-EINVAL); } -const struct ieee80211_reg_rule *__freq_reg_info(struct wiphy *wiphy, - u32 center_freq, u32 min_bw) +static const struct ieee80211_reg_rule * +__freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) { const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); const struct ieee80211_reg_rule *reg_rule = NULL; -- cgit v1.1 From fc58c47ef1ace65c5c1c94f2e96578e7b04aad64 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Sat, 15 Aug 2015 22:04:01 +0300 Subject: mac80211: process skb_queue while scanning in HW Queued frames aren't processed during scan, which results in an inability to complete the BA session establishment until the scan ends. Since we can't tx frames until the BA agreement setup is complete, it might result in a very large latency during scan. Fix this by allowing to process queued skbs while scanning in HW. This should be ok since the devices which support hw scan should be able to handle tx/rx while scanning. During SW scan, mac80211 drops any txed frames besides probes and NDPs, so it is still needed to delay processing of the queued frames till the SW scan is done. 
Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6964fc6..42d7f0f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1204,7 +1204,7 @@ static void ieee80211_iface_work(struct work_struct *work) if (!ieee80211_sdata_running(sdata)) return; - if (local->scanning) + if (test_bit(SCAN_SW_SCANNING, &local->scanning)) return; if (!ieee80211_can_run_worker(local)) -- cgit v1.1 From 82c0cc90d6268a3cd3ee058257d2146188326452 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Sat, 15 Aug 2015 22:39:46 +0300 Subject: mac80211: debugfs: add file to disallow TDLS wider-bw Sometimes we are interested in testing TDLS performance in a specific width setting. Add the ability to disable the wider-band feature, thereby allowing the TDLS channel width to be controlled by the BSS width. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + net/mac80211/debugfs_netdev.c | 29 +++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/tdls.c | 4 +++- 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 685ec13..1ca972e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1156,6 +1156,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + !sdata->u.mgd.tdls_wider_bw_prohibited && ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && params->ext_capab_len >= 8 && params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 1021e87..f1580e9 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -455,6 
+455,34 @@ static ssize_t ieee80211_if_parse_uapsd_max_sp_len( } IEEE80211_IF_FILE_RW(uapsd_max_sp_len); +static ssize_t ieee80211_if_fmt_tdls_wider_bw( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + const struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_wider_bw; + + tdls_wider_bw = ieee80211_hw_check(&sdata->local->hw, TDLS_WIDER_BW) && + !ifmgd->tdls_wider_bw_prohibited; + + return snprintf(buf, buflen, "%d\n", tdls_wider_bw); +} + +static ssize_t ieee80211_if_parse_tdls_wider_bw( + struct ieee80211_sub_if_data *sdata, const char *buf, int buflen) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + u8 val; + int ret; + + ret = kstrtou8(buf, 0, &val); + if (ret) + return ret; + + ifmgd->tdls_wider_bw_prohibited = !val; + return buflen; +} +IEEE80211_IF_FILE_RW(tdls_wider_bw); + /* AP attributes */ IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC); IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC); @@ -614,6 +642,7 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD_MODE(beacon_loss, 0200); DEBUGFS_ADD_MODE(uapsd_queues, 0600); DEBUGFS_ADD_MODE(uapsd_max_sp_len, 0600); + DEBUGFS_ADD_MODE(tdls_wider_bw, 0600); } static void add_ap_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6e52659..65f4faa 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -535,6 +535,7 @@ struct ieee80211_if_managed { struct sk_buff *teardown_skb; /* A copy to send through the AP */ spinlock_t teardown_lock; /* To lock changing teardown_skb */ bool tdls_chan_switch_prohibited; + bool tdls_wider_bw_prohibited; /* WMM-AC TSPEC support */ struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS]; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index aee701a..1bacea7 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -41,9 +41,11 @@ static void ieee80211_tdls_add_ext_capab(struct 
ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool chan_switch = local->hw.wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH; - bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW); + bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && + !ifmgd->tdls_wider_bw_prohibited; enum ieee80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool vht = sband && sband->vht_cap.vht_supported; -- cgit v1.1 From dd55ab59b6234c73522dc533757e89e6a77c2c38 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Sat, 15 Aug 2015 22:39:48 +0300 Subject: mac80211: TDLS: check reg with IR-relax on chandef upgrade When checking if a TDLS chandef can be upgraded, IR-relaxation can be taken into account to allow more channels. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 1bacea7..52f3187 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -333,8 +333,8 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, /* proceed to downgrade the chandef until usable or the same */ while (uc.width > max_width && - !cfg80211_reg_can_beacon(sdata->local->hw.wiphy, - &uc, sdata->wdev.iftype)) + !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, + sdata->wdev.iftype)) ieee80211_chandef_downgrade(&uc); if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { -- cgit v1.1 From 7bdbe400d1b2aac116513f90b75969ad2365fba6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 15 Aug 2015 22:39:49 +0300 Subject: nl80211: support vendor dumpit commands In order to transfer many items in vendor commands, support the dumpit netlink method for them. 
Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5d8748b..a4e6c95 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2015 Intel Deutschland GmbH */ #include @@ -9938,6 +9939,9 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) if (!wdev->netdev && !wdev->p2p_started) return -ENETDOWN; } + + if (!vcmd->doit) + return -EOPNOTSUPP; } else { wdev = NULL; } @@ -9957,6 +9961,193 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } +static int nl80211_prepare_vendor_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct wireless_dev **wdev) +{ + u32 vid, subcmd; + unsigned int i; + int vcmd_idx = -1; + int err; + void *data = NULL; + unsigned int data_len = 0; + + rtnl_lock(); + + if (cb->args[0]) { + /* subtract the 1 again here */ + struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); + struct wireless_dev *tmp; + + if (!wiphy) { + err = -ENODEV; + goto out_unlock; + } + *rdev = wiphy_to_rdev(wiphy); + *wdev = NULL; + + if (cb->args[1]) { + list_for_each_entry(tmp, &(*rdev)->wdev_list, list) { + if (tmp->identifier == cb->args[1] - 1) { + *wdev = tmp; + break; + } + } + } + + /* keep rtnl locked in successful case */ + return 0; + } + + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, + nl80211_fam.attrbuf, nl80211_fam.maxattr, + nl80211_policy); + if (err) + goto out_unlock; + + if (!nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID] || + !nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) { + err = -EINVAL; + goto out_unlock; + } + + *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), + nl80211_fam.attrbuf); + if 
(IS_ERR(*wdev)) + *wdev = NULL; + + *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), + nl80211_fam.attrbuf); + if (IS_ERR(*rdev)) { + err = PTR_ERR(*rdev); + goto out_unlock; + } + + vid = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_ID]); + subcmd = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_SUBCMD]); + + for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) { + const struct wiphy_vendor_command *vcmd; + + vcmd = &(*rdev)->wiphy.vendor_commands[i]; + + if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) + continue; + + if (!vcmd->dumpit) { + err = -EOPNOTSUPP; + goto out_unlock; + } + + vcmd_idx = i; + break; + } + + if (vcmd_idx < 0) { + err = -EOPNOTSUPP; + goto out_unlock; + } + + if (nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]) { + data = nla_data(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]); + data_len = nla_len(nl80211_fam.attrbuf[NL80211_ATTR_VENDOR_DATA]); + } + + /* 0 is the first index - add 1 to parse only once */ + cb->args[0] = (*rdev)->wiphy_idx + 1; + /* add 1 to know if it was NULL */ + cb->args[1] = *wdev ? 
(*wdev)->identifier + 1 : 0; + cb->args[2] = vcmd_idx; + cb->args[3] = (unsigned long)data; + cb->args[4] = data_len; + + /* keep rtnl locked in successful case */ + return 0; + out_unlock: + rtnl_unlock(); + return err; +} + +static int nl80211_vendor_cmd_dump(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + unsigned int vcmd_idx; + const struct wiphy_vendor_command *vcmd; + void *data; + int data_len; + int err; + struct nlattr *vendor_data; + + err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev); + if (err) + return err; + + vcmd_idx = cb->args[2]; + data = (void *)cb->args[3]; + data_len = cb->args[4]; + vcmd = &rdev->wiphy.vendor_commands[vcmd_idx]; + + if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | + WIPHY_VENDOR_CMD_NEED_NETDEV)) { + if (!wdev) + return -EINVAL; + if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && + !wdev->netdev) + return -EINVAL; + + if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { + if (wdev->netdev && + !netif_running(wdev->netdev)) + return -ENETDOWN; + if (!wdev->netdev && !wdev->p2p_started) + return -ENETDOWN; + } + } + + while (1) { + void *hdr = nl80211hdr_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + NL80211_CMD_VENDOR); + if (!hdr) + break; + + if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + (wdev && nla_put_u64(skb, NL80211_ATTR_WDEV, + wdev_id(wdev)))) { + genlmsg_cancel(skb, hdr); + break; + } + + vendor_data = nla_nest_start(skb, NL80211_ATTR_VENDOR_DATA); + if (!vendor_data) { + genlmsg_cancel(skb, hdr); + break; + } + + err = vcmd->dumpit(&rdev->wiphy, wdev, skb, data, data_len, + (unsigned long *)&cb->args[5]); + nla_nest_end(skb, vendor_data); + + if (err == -ENOBUFS || err == -ENOENT) { + genlmsg_cancel(skb, hdr); + break; + } else if (err) { + genlmsg_cancel(skb, hdr); + goto out; + } + + genlmsg_end(skb, hdr); + } + + err = skb->len; + out: + rtnl_unlock(); + return err; +} + struct sk_buff 
*__cfg80211_alloc_reply_skb(struct wiphy *wiphy, enum nl80211_commands cmd, enum nl80211_attrs attr, @@ -10994,6 +11185,7 @@ static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_VENDOR, .doit = nl80211_vendor_cmd, + .dumpit = nl80211_vendor_cmd_dump, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | -- cgit v1.1 From 1b09b5568e5f46c6dfb781d7c1dfad431a6d8ec1 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Sat, 15 Aug 2015 22:39:50 +0300 Subject: mac80211: introduce per vif frame registration API Currently the cfg80211's frame registration api receives wdev, however mac80211 assumes per device filter configuration and ignores wdev. Per device filtering is too wasteful, especially for multi-channel devices. Introduce new per vif frame registration API and use it for probe request registrations in ieee80211_mgmt_frame_register() Also call directly to ieee80211_configure_filter instead of using a work since it is now allowed to sleep in ieee80211_mgmt_frame_register. 
Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 22 ++++++++++++++++++---- net/mac80211/driver-ops.h | 16 ++++++++++++++++ net/mac80211/trace.h | 30 ++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1ca972e..9eab783 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3516,18 +3516,32 @@ static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, u16 frame_type, bool reg) { struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); switch (frame_type) { case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: - if (reg) + if (reg) { local->probe_req_reg++; - else - local->probe_req_reg--; + sdata->vif.probe_req_reg++; + } else { + if (local->probe_req_reg) + local->probe_req_reg--; + + if (sdata->vif.probe_req_reg) + sdata->vif.probe_req_reg--; + } if (!local->open_count) break; - ieee80211_queue_work(&local->hw, &local->reconfig_filter); + if (sdata->vif.probe_req_reg == 1) + drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, + FIF_PROBE_REQ); + else if (sdata->vif.probe_req_reg == 0) + drv_config_iface_filter(local, sdata, 0, + FIF_PROBE_REQ); + + ieee80211_configure_filter(local); break; default: break; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 02d9133..157b20b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -260,6 +260,22 @@ static inline void drv_configure_filter(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_config_iface_filter(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + unsigned int filter_flags, + unsigned int changed_flags) +{ + might_sleep(); + + trace_drv_config_iface_filter(local, sdata, filter_flags, + changed_flags); + if (local->ops->config_iface_filter) + 
local->ops->config_iface_filter(&local->hw, &sdata->vif, + filter_flags, + changed_flags); + trace_drv_return_void(local); +} + static inline int drv_set_tim(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 6f14591..b5960b9 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -497,6 +497,36 @@ TRACE_EVENT(drv_configure_filter, ) ); +TRACE_EVENT(drv_config_iface_filter, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + unsigned int filter_flags, + unsigned int changed_flags), + + TP_ARGS(local, sdata, filter_flags, changed_flags), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(unsigned int, filter_flags) + __field(unsigned int, changed_flags) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->filter_flags = filter_flags; + __entry->changed_flags = changed_flags; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + " filter_flags: %#x changed_flags: %#x", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags, + __entry->changed_flags + ) +); + TRACE_EVENT(drv_set_tim, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set), -- cgit v1.1 From e3abc8ff0fc18b3925fd5d5c5fbd1613856f4e7c Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 16 Aug 2015 11:13:22 +0300 Subject: mac80211: allow to transmit A-MSDU within A-MPDU Advertise the capability to send A-MSDU within A-MPDU in the AddBA request sent by mac80211. Let the driver know about the peer's capabilities. 
Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 4 ++-- net/mac80211/agg-tx.c | 15 ++++++++++----- net/mac80211/driver-ops.h | 7 ++++--- net/mac80211/sta_info.h | 2 ++ net/mac80211/trace.h | 10 ++++++---- 5 files changed, 24 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 5c564a6..6ebe861 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -79,7 +79,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, (int)reason); if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP, - &sta->sta, tid, NULL, 0)) + &sta->sta, tid, NULL, 0, false)) sdata_info(sta->sdata, "HW problem - can not stop rx aggregation for %pM tid %d\n", sta->sta.addr, tid); @@ -321,7 +321,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, __skb_queue_head_init(&tid_agg_rx->reorder_buf[i]); ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, - &sta->sta, tid, &start_seq_num, 0); + &sta->sta, tid, &start_seq_num, 0, false); ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n", sta->sta.addr, tid, ret); if (ret) { diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c8ba2e7..a758eb84 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -97,7 +97,8 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; mgmt->u.action.u.addba_req.dialog_token = dialog_token; - capab = (u16)(1 << 1); /* bit 1 aggregation policy */ + capab = (u16)(1 << 0); /* bit 0 A-MSDU support */ + capab |= (u16)(1 << 1); /* bit 1 aggregation policy */ capab |= (u16)(tid << 2); /* bit 5:2 TID number */ capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */ @@ -331,7 +332,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, return -EALREADY; ret = drv_ampdu_action(local, sta->sdata, 
IEEE80211_AMPDU_TX_STOP_FLUSH_CONT, - &sta->sta, tid, NULL, 0); + &sta->sta, tid, NULL, 0, false); WARN_ON_ONCE(ret); return 0; } @@ -381,7 +382,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST; ret = drv_ampdu_action(local, sta->sdata, action, - &sta->sta, tid, NULL, 0); + &sta->sta, tid, NULL, 0, false); /* HW shall not deny going back to legacy */ if (WARN_ON(ret)) { @@ -469,7 +470,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) start_seq_num = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START, - &sta->sta, tid, &start_seq_num, 0); + &sta->sta, tid, &start_seq_num, 0, false); if (ret) { ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", @@ -693,7 +694,8 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_TX_OPERATIONAL, - &sta->sta, tid, NULL, tid_tx->buf_size); + &sta->sta, tid, NULL, tid_tx->buf_size, + tid_tx->amsdu); /* * synchronize with TX path, while splicing the TX path @@ -918,8 +920,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, struct tid_ampdu_tx *tid_tx; u16 capab, tid; u8 buf_size; + bool amsdu; capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); + amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; @@ -968,6 +972,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, } tid_tx->buf_size = buf_size; + tid_tx->amsdu = amsdu; if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)) ieee80211_agg_tx_operational(local, sta, tid); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 157b20b..31482e2 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -734,7 +734,7 @@ static inline int drv_ampdu_action(struct ieee80211_local *local, 
struct ieee80211_sub_if_data *sdata, enum ieee80211_ampdu_mlme_action action, struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size) + u16 *ssn, u8 buf_size, bool amsdu) { int ret = -EOPNOTSUPP; @@ -744,11 +744,12 @@ static inline int drv_ampdu_action(struct ieee80211_local *local, if (!check_sdata_in_driver(sdata)) return -EIO; - trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size); + trace_drv_ampdu_action(local, sdata, action, sta, tid, + ssn, buf_size, amsdu); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, - sta, tid, ssn, buf_size); + sta, tid, ssn, buf_size, amsdu); trace_drv_return_int(local, ret); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index b087c71..d5ded87 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -133,6 +133,7 @@ enum ieee80211_agg_stop_reason { * @buf_size: reorder buffer size at receiver * @failed_bar_ssn: ssn of the last failed BAR tx attempt * @bar_pending: BAR needs to be re-sent + * @amsdu: support A-MSDU withing A-MDPU * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. 
@@ -158,6 +159,7 @@ struct tid_ampdu_tx { u16 failed_bar_ssn; bool bar_pending; + bool amsdu; }; /** diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index b5960b9..314e3bd 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -974,9 +974,9 @@ TRACE_EVENT(drv_ampdu_action, struct ieee80211_sub_if_data *sdata, enum ieee80211_ampdu_mlme_action action, struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size), + u16 *ssn, u8 buf_size, bool amsdu), - TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size), + TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu), TP_STRUCT__entry( LOCAL_ENTRY @@ -985,6 +985,7 @@ TRACE_EVENT(drv_ampdu_action, __field(u16, tid) __field(u16, ssn) __field(u8, buf_size) + __field(bool, amsdu) VIF_ENTRY ), @@ -996,12 +997,13 @@ TRACE_EVENT(drv_ampdu_action, __entry->tid = tid; __entry->ssn = ssn ? *ssn : 0; __entry->buf_size = buf_size; + __entry->amsdu = amsdu; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d", + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d", LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, - __entry->tid, __entry->buf_size + __entry->tid, __entry->buf_size, __entry->amsdu ) ); -- cgit v1.1 From 46cad4b7a131a215159d889fa88d0dc71d581908 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 15 Aug 2015 22:39:54 +0300 Subject: mac80211: remove direct probe step before authentication The direct probe step before authentication was done mostly for two reasons: 1) the BSS data could be stale 2) the beacon might not have included all IEs The concern (1) doesn't really seem to be relevant any more as we time out BSS information after about 30 seconds, and in fact the original patch only did the direct probe if the data was older than the BSS timeout to begin with. This condition got (likely inadvertedly) removed later though. 
Analysing this in more detail shows that since we mostly use data from the association response, the only real reason for needing the probe response was that the code validates the WMM parameters, and those are optional in beacons. As the previous patches removed that behaviour, we can now remove the direct probe step entirely. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 82 +++++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 705ef1d..6daadf2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3262,16 +3262,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, if (ifmgd->associated && ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) ieee80211_reset_ap_probe(sdata); - - if (ifmgd->auth_data && !ifmgd->auth_data->bss->proberesp_ies && - ether_addr_equal(mgmt->bssid, ifmgd->auth_data->bss->bssid)) { - /* got probe response, continue with auth */ - sdata_info(sdata, "direct probe responded\n"); - ifmgd->auth_data->tries = 0; - ifmgd->auth_data->timeout = jiffies; - ifmgd->auth_data->timeout_started = true; - run_again(sdata, ifmgd->auth_data->timeout); - } } /* @@ -3717,12 +3707,14 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, reason); } -static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) +static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data; u32 tx_flags = 0; + u16 trans = 1; + u16 status = 0; sdata_assert_lock(sdata); @@ -3746,54 +3738,27 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) drv_mgd_prepare_tx(local, sdata); - if (auth_data->bss->proberesp_ies) { - u16 trans = 1; - u16 status = 0; - - sdata_info(sdata, "send auth to %pM (try 
%d/%d)\n", - auth_data->bss->bssid, auth_data->tries, - IEEE80211_AUTH_MAX_TRIES); - - auth_data->expected_transaction = 2; + sdata_info(sdata, "send auth to %pM (try %d/%d)\n", + auth_data->bss->bssid, auth_data->tries, + IEEE80211_AUTH_MAX_TRIES); - if (auth_data->algorithm == WLAN_AUTH_SAE) { - trans = auth_data->sae_trans; - status = auth_data->sae_status; - auth_data->expected_transaction = trans; - } + auth_data->expected_transaction = 2; - if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) - tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | - IEEE80211_TX_INTFL_MLME_CONN_TX; - - ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, - auth_data->data, auth_data->data_len, - auth_data->bss->bssid, - auth_data->bss->bssid, NULL, 0, 0, - tx_flags); - } else { - const u8 *ssidie; + if (auth_data->algorithm == WLAN_AUTH_SAE) { + trans = auth_data->sae_trans; + status = auth_data->sae_status; + auth_data->expected_transaction = trans; + } - sdata_info(sdata, "direct probe to %pM (try %d/%i)\n", - auth_data->bss->bssid, auth_data->tries, - IEEE80211_AUTH_MAX_TRIES); + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) + tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_INTFL_MLME_CONN_TX; - rcu_read_lock(); - ssidie = ieee80211_bss_get_ie(auth_data->bss, WLAN_EID_SSID); - if (!ssidie) { - rcu_read_unlock(); - return -EINVAL; - } - /* - * Direct probe is sent to broadcast address as some APs - * will not answer to direct packet in unassociated state. 
- */ - ieee80211_send_probe_req(sdata, sdata->vif.addr, NULL, - ssidie + 2, ssidie[1], - NULL, 0, (u32) -1, true, 0, - auth_data->bss->channel, false); - rcu_read_unlock(); - } + ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, + auth_data->data, auth_data->data_len, + auth_data->bss->bssid, + auth_data->bss->bssid, NULL, 0, 0, + tx_flags); if (tx_flags == 0) { auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; @@ -3874,8 +3839,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) bool status_acked = ifmgd->status_acked; ifmgd->status_received = false; - if (ifmgd->auth_data && - (ieee80211_is_probe_req(fc) || ieee80211_is_auth(fc))) { + if (ifmgd->auth_data && ieee80211_is_auth(fc)) { if (status_acked) { ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SHORT; @@ -3906,7 +3870,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) * so let's just kill the auth data */ ieee80211_destroy_auth_data(sdata, false); - } else if (ieee80211_probe_auth(sdata)) { + } else if (ieee80211_auth(sdata)) { u8 bssid[ETH_ALEN]; struct ieee80211_event event = { .type = MLME_EVENT, @@ -4597,7 +4561,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, if (err) goto err_clear; - err = ieee80211_probe_auth(sdata); + err = ieee80211_auth(sdata); if (err) { sta_info_destroy_addr(sdata, req->bss->bssid); goto err_clear; -- cgit v1.1 From 99e7ca44bb910f0cbfda5d9008e8517df0ebc939 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sat, 15 Aug 2015 22:39:51 +0300 Subject: mac80211: allow the driver to advertise A-MSDU within A-MPDU Rx support Drivers may be interested in receiving A-MSDU within A-MPDU. Not all the devices may be able to do so, make it configurable.
Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 4 +++- net/mac80211/debugfs.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 6ebe861..10ad4ac 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -189,6 +189,7 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; + bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU); u16 capab; skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); @@ -217,7 +218,8 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; mgmt->u.action.u.addba_resp.dialog_token = dialog_token; - capab = (u16)(policy << 1); /* bit 1 aggregation policy */ + capab = (u16)(amsdu << 0); /* bit 0 A-MSDU support */ + capab |= (u16)(policy << 1); /* bit 1 aggregation policy */ capab |= (u16)(tid << 2); /* bit 5:2 TID number */ capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index ced6bf3..41726fd 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -123,6 +123,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(SUPPORTS_CLONED_SKBS), FLAG(SINGLE_SCAN_ON_ALL_BANDS), FLAG(TDLS_WIDER_BW), + FLAG(SUPPORTS_AMSDU_IN_AMPDU), /* keep last for the build bug below */ (void *)0x1 -- cgit v1.1 From f020ae40b0c969d3fd3b320d0a05e62d5553ff72 Mon Sep 17 00:00:00 2001 From: Chun-Yeow Yeoh Date: Fri, 4 Sep 2015 10:58:05 +0800 Subject: mac80211: zero center freq segment 2 in VHT oper IE Clear the Channel Center Frequency Segment 2 in VHT operation IEs to avoid sending non-zero values if the SKB wasn't zeroed before adding the VHT operation IE. 
Signed-off-by: Chun-Yeow Yeoh [change commit message a bit - not necessarily just mesh related] Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 1104421..f167056 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2324,6 +2324,8 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, if (chandef->center_freq2) vht_oper->center_freq_seg2_idx = ieee80211_frequency_to_channel(chandef->center_freq2); + else + vht_oper->center_freq_seg2_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: -- cgit v1.1 From c85fb53c4fa6521352028c40ce096a808aabd389 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 27 Aug 2015 09:00:18 -0400 Subject: mac80211: implement VHT support for mesh Implement the basics required for supporting very high throughput with mesh: include VHT information elements in beacons, probe responses, and peering action frames, and check for compatible VHT configurations when peering. 
Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-- net/mac80211/mesh.h | 4 +++ net/mac80211/mesh_plink.c | 9 +++++- 3 files changed, 85 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index e06a5ca..62b3e29 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -94,6 +94,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, ie->ht_operation, &sta_chan_def); + ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan, + ie->vht_operation, &sta_chan_def); + if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) return false; @@ -436,8 +439,6 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; - enum nl80211_channel_type channel_type = - cfg80211_get_chandef_type(&sdata->vif.bss_conf.chandef); struct ieee80211_supported_band *sband; struct ieee80211_sta_ht_cap *ht_cap; u8 *pos; @@ -454,7 +455,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[channel->band]; ht_cap = &sband->ht_cap; - if (!ht_cap->ht_supported || channel_type == NL80211_CHAN_NO_HT) + if (!ht_cap->ht_supported || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_operation)) @@ -467,6 +471,68 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } +int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct 
ieee80211_supported_band *sband; + u8 *pos; + + sband = local->hw.wiphy->bands[band]; + if (!sband->vht_cap.vht_supported || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return 0; + + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) + return -ENOMEM; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); + ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); + + return 0; +} + +int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_channel *channel; + struct ieee80211_supported_band *sband; + struct ieee80211_sta_vht_cap *vht_cap; + u8 *pos; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return -EINVAL; + } + channel = chanctx_conf->def.chan; + rcu_read_unlock(); + + sband = local->hw.wiphy->bands[channel->band]; + vht_cap = &sband->vht_cap; + + if (!vht_cap->vht_supported || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + return 0; + + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_operation)) + return -ENOMEM; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); + ieee80211_ie_build_vht_oper(pos, vht_cap, + &sdata->vif.bss_conf.chandef); + + return 0; +} + static void ieee80211_mesh_path_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = @@ -637,6 +703,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + ifmsh->mesh_id_len + 2 + sizeof(struct ieee80211_meshconf_ie) + 2 + sizeof(__le16) + /* awake window */ + 2 + sizeof(struct ieee80211_vht_cap) + + 
2 + sizeof(struct ieee80211_vht_operation) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -718,6 +786,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_meshid_ie(sdata, skb) || mesh_add_meshconf_ie(sdata, skb) || mesh_add_awake_window_ie(sdata, skb) || + mesh_add_vht_cap_ie(sdata, skb) || + mesh_add_vht_oper_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 50c8473..c60be85 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -227,6 +227,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 5838464..a360b24 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -226,6 +226,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_meshconf_ie) + 2 + sizeof(struct ieee80211_ht_cap) + 2 + sizeof(struct ieee80211_ht_operation) + + 2 + sizeof(struct ieee80211_vht_cap) + + 2 + sizeof(struct ieee80211_vht_operation) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -306,7 +308,9 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action != WLAN_SP_MESH_PEERING_CLOSE) { if (mesh_add_ht_cap_ie(sdata, skb) || - mesh_add_ht_oper_ie(sdata, skb)) + mesh_add_ht_oper_ie(sdata, skb) || + mesh_add_vht_cap_ie(sdata, skb) || + mesh_add_vht_oper_ie(sdata, skb)) goto free; } @@ -402,6 +406,9 @@ static void mesh_sta_info_init(struct 
ieee80211_sub_if_data *sdata, elems->ht_cap_elem, sta)) changed |= IEEE80211_RC_BW_CHANGED; + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, + elems->vht_cap_elem, sta); + if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; -- cgit v1.1 From 8e0d7fe07c3f8c2a5e3b5bdbfdf09de4da2e2dd4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 28 Aug 2015 10:52:52 +0200 Subject: mac80211: remove last_beacon/ave_beacon debugfs files These file aren't really useful: - if per beacon data is required then you need to use radiotap or similar anyway, debugfs won't help much - average beacon signal is reported in station info in nl80211 and can be looked up with iw Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 12 ------------ net/mac80211/ieee80211_i.h | 3 --- net/mac80211/mlme.c | 1 - 3 files changed, 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index f1580e9..37ea30e 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -114,14 +114,6 @@ static ssize_t ieee80211_if_fmt_##name( \ return scnprintf(buf, buflen, "%pM\n", sdata->field); \ } -#define IEEE80211_IF_FMT_DEC_DIV_16(name, field) \ -static ssize_t ieee80211_if_fmt_##name( \ - const struct ieee80211_sub_if_data *sdata, \ - char *buf, int buflen) \ -{ \ - return scnprintf(buf, buflen, "%d\n", sdata->field / 16); \ -} - #define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field) \ static ssize_t ieee80211_if_fmt_##name( \ const struct ieee80211_sub_if_data *sdata, \ @@ -247,8 +239,6 @@ IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); -IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC); -IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16); IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, @@ -634,8 +624,6 @@ 
static void add_sta_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(bssid); DEBUGFS_ADD(aid); - DEBUGFS_ADD(last_beacon); - DEBUGFS_ADD(ave_beacon); DEBUGFS_ADD(beacon_timeout); DEBUGFS_ADD_MODE(smps, 0600); DEBUGFS_ADD_MODE(tkip_mic_test, 0200); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 65f4faa..9482f32 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -490,9 +490,6 @@ struct ieee80211_if_managed { s16 p2p_noa_index; - /* Signal strength from the last Beacon frame in the current BSS. */ - int last_beacon_signal; - /* * Weighted average of the signal strength from Beacon frames in the * current BSS. This is in units of 1/16 of the signal unit to maintain diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6daadf2..ce01cd3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3364,7 +3364,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, bssid = ifmgd->associated->bssid; /* Track average RSSI from the Beacon frames of the current AP */ - ifmgd->last_beacon_signal = rx_status->signal; if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; ifmgd->ave_beacon_signal = rx_status->signal * 16; -- cgit v1.1 From 8ec6d97871f37e4743678ea4a455bd59580aa0f4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 28 Aug 2015 10:52:53 +0200 Subject: mac80211: fix driver RSSI event calculations The ifmgd->ave_beacon_signal value cannot be taken as is for comparisons, it must be divided by 16 since it's represented like that for better accuracy of the EWMA calculations. This would lead to invalid driver RSSI events. Fix the used value.
Fixes: 615f7b9bb1f8 ("mac80211: add driver RSSI threshold events") Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ce01cd3..79cfc2b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3380,7 +3380,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold && ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { - int sig = ifmgd->ave_beacon_signal; + int sig = ifmgd->ave_beacon_signal / 16; int last_sig = ifmgd->last_ave_beacon_signal; struct ieee80211_event event = { .type = RSSI_EVENT, -- cgit v1.1 From 338c17ae311e6b5a439573a4043fd2d9237cd1d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 28 Aug 2015 10:52:54 +0200 Subject: mac80211: use DECLARE_EWMA for ave_beacon_signal It doesn't seem problematic to change the weight for the average beacon signal from 3 to 4, so use DECLARE_EWMA. This also makes the code easier to maintain since bugs like the one fixed in the previous patch can't happen as easily. With a fix from Avraham Stern to invert the sign since EWMA uses unsigned values only. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 10 +++------- net/mac80211/mlme.c | 20 ++++++-------------- net/mac80211/util.c | 2 +- 3 files changed, 10 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9482f32..f0aee76 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -419,6 +419,8 @@ struct ieee80211_sta_tx_tspec { bool downgraded; }; +DECLARE_EWMA(beacon_signal, 16, 4) + struct ieee80211_if_managed { struct timer_list timer; struct timer_list conn_mon_timer; @@ -490,13 +492,7 @@ struct ieee80211_if_managed { s16 p2p_noa_index; - /* - * Weighted average of the signal strength from Beacon frames in the - * current BSS.
This is in units of 1/16 of the signal unit to maintain - * accuracy and to speed up calculations, i.e., the value need to be - * divided by 16 to get the actual value. - */ - int ave_beacon_signal; + struct ewma_beacon_signal ave_beacon_signal; /* * Number of Beacon frames used in ave_beacon_signal. This can be used diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 79cfc2b..c7d316b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -82,13 +82,6 @@ MODULE_PARM_DESC(probe_wait_ms, " before disconnecting (reason 4)."); /* - * Weight given to the latest Beacon frame when calculating average signal - * strength for Beacon frames received in the current BSS. This must be - * between 1 and 15. - */ -#define IEEE80211_SIGNAL_AVE_WEIGHT 3 - -/* * How many Beacon frames need to have been used in average signal strength * before starting to indicate signal change events. */ @@ -3366,21 +3359,19 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, /* Track average RSSI from the Beacon frames of the current AP */ if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; - ifmgd->ave_beacon_signal = rx_status->signal * 16; + ewma_beacon_signal_init(&ifmgd->ave_beacon_signal); ifmgd->last_cqm_event_signal = 0; ifmgd->count_beacon_signal = 1; ifmgd->last_ave_beacon_signal = 0; } else { - ifmgd->ave_beacon_signal = - (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 + - (16 - IEEE80211_SIGNAL_AVE_WEIGHT) * - ifmgd->ave_beacon_signal) / 16; ifmgd->count_beacon_signal++; } + ewma_beacon_signal_add(&ifmgd->ave_beacon_signal, -rx_status->signal); + if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold && ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { - int sig = ifmgd->ave_beacon_signal / 16; + int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); int last_sig = ifmgd->last_ave_beacon_signal; struct ieee80211_event event = { .type = RSSI_EVENT, @@ -3407,10 +3398,11 @@ 
static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (bss_conf->cqm_rssi_thold && ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) { - int sig = ifmgd->ave_beacon_signal / 16; + int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); int last_event = ifmgd->last_cqm_event_signal; int thold = bss_conf->cqm_rssi_thold; int hyst = bss_conf->cqm_rssi_hyst; + if (sig < thold && (last_event == 0 || sig < last_event - hyst)) { ifmgd->last_cqm_event_signal = sig; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f167056..9cabf07 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2543,7 +2543,7 @@ int ieee80211_ave_rssi(struct ieee80211_vif *vif) /* non-managed type inferfaces */ return 0; } - return ifmgd->ave_beacon_signal / 16; + return -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); -- cgit v1.1 From d55d0d598e6610bbfcc1f2ecd6e8af669b94783b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Mon, 31 Aug 2015 22:59:38 +0200 Subject: nl80211: put current TX power in interface info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many drivers implement reading current TX power (using either cfg80211 or ieee80211 op) but userspace can't get it using nl80211. Right now the only way to access it is to call some wext ioctl. Let's put TX power in interface info reply (callback is wdev specific) just like we do with current channel. To be consistent (e.g. NL80211_CMD_SET_WIPHY) let's use mBm as a unit.
Signed-off-by: Rafał Miłecki Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a4e6c95..50cd770 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2404,6 +2404,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag } } + if (rdev->ops->get_tx_power) { + int dbm, ret; + + ret = rdev_get_tx_power(rdev, wdev, &dbm); + if (ret == 0 && + nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, + DBM_TO_MBM(dbm))) + goto nla_put_failure; + } + if (wdev->ssid_len) { if (nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) goto nla_put_failure; -- cgit v1.1 From 4dc792b8f098ab6327033fc97ba40163a2cd5fcc Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Wed, 2 Sep 2015 13:23:30 +0200 Subject: mac80211: Split sending tx'ed frames to monitor interfaces into its own function This allows ieee80211_tx_monitor to be used directly for sending 802.11 frames to all monitor interfaces.
Signed-off-by: Helmut Schaa Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++ net/mac80211/status.c | 108 +++++++++++++++++++++++++-------------------- 2 files changed, 62 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f0aee76..1af655a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1635,6 +1635,9 @@ void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); +void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, + struct ieee80211_supported_band *sband, + int retry_count, int shift, bool send_to_cooked); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 8ba5832..98fd04c4 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -668,16 +668,70 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_tx_status_noskb); -void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, + struct ieee80211_supported_band *sband, + int retry_count, int shift, bool send_to_cooked) { struct sk_buff *skb2; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_sub_if_data *sdata; + struct net_device *prev_dev = NULL; + int rtap_len; + + /* send frame to monitor interfaces now */ + rtap_len = ieee80211_tx_radiotap_len(info); + if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { + pr_err("ieee80211_tx_status: headroom too small\n"); + dev_kfree_skb(skb); + return; + } + ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, + rtap_len, shift); + + /* XXX: is this sufficient for BPF? 
*/ + skb_set_mac_header(skb, 0); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_OTHERHOST; + skb->protocol = htons(ETH_P_802_2); + memset(skb->cb, 0, sizeof(skb->cb)); + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && + !send_to_cooked) + continue; + + if (prev_dev) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) { + skb2->dev = prev_dev; + netif_rx(skb2); + } + } + + prev_dev = sdata->dev; + } + } + if (prev_dev) { + skb->dev = prev_dev; + netif_rx(skb); + skb = NULL; + } + rcu_read_unlock(); + dev_kfree_skb(skb); +} + +void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +{ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); __le16 fc; struct ieee80211_supported_band *sband; - struct ieee80211_sub_if_data *sdata; - struct net_device *prev_dev = NULL; struct sta_info *sta; struct rhash_head *tmp; int retry_count; @@ -685,7 +739,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) bool send_to_cooked; bool acked; struct ieee80211_bar *bar; - int rtap_len; int shift = 0; int tid = IEEE80211_NUM_TIDS; const struct bucket_table *tbl; @@ -878,51 +931,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) return; } - /* send frame to monitor interfaces now */ - rtap_len = ieee80211_tx_radiotap_len(info); - if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { - pr_err("ieee80211_tx_status: headroom too small\n"); - dev_kfree_skb(skb); - return; - } - ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, - rtap_len, shift); - - /* XXX: is this sufficient for BPF? 
*/ - skb_set_mac_header(skb, 0); - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->pkt_type = PACKET_OTHERHOST; - skb->protocol = htons(ETH_P_802_2); - memset(skb->cb, 0, sizeof(skb->cb)); - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { - if (!ieee80211_sdata_running(sdata)) - continue; - - if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && - !send_to_cooked) - continue; - - if (prev_dev) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->dev = prev_dev; - netif_rx(skb2); - } - } - - prev_dev = sdata->dev; - } - } - if (prev_dev) { - skb->dev = prev_dev; - netif_rx(skb); - skb = NULL; - } - rcu_read_unlock(); - dev_kfree_skb(skb); + /* send to monitor interfaces */ + ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked); } EXPORT_SYMBOL(ieee80211_tx_status); -- cgit v1.1 From 594b31ea7dc6101519deee1b31483fce2e1a7414 Mon Sep 17 00:00:00 2001 From: Frederic Danis Date: Wed, 23 Sep 2015 18:18:07 +0200 Subject: Bluetooth: Add BT_WARN and bt_dev_warn logging macros Add warning logging macros to bluetooth subsystem logs. Signed-off-by: Frederic Danis Signed-off-by: Marcel Holtmann --- net/bluetooth/lib.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 8b4cdce..aa4cf64 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -151,6 +151,22 @@ void bt_info(const char *format, ...) } EXPORT_SYMBOL(bt_info); +void bt_warn(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_warn("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_warn); + void bt_err(const char *format, ...) 
{ struct va_format vaf; -- cgit v1.1 From c2d5ecfaeafdedfb997a466c654c7029c511f43d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Sep 2015 09:37:11 +0200 Subject: mac802154: iface: assume big endian for af_packet The callback "create" and "parse" from header_ops are called from netdev core upper-layer functionality, like af_packet. These callbacks assumes big endian for addresses and we should not introduce a special byteordering handling for ieee802154 over af_packet in userspace. We have an identical issue with setting the mac address which also assumes big endian byteordering. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index b5a0936..3954bcf 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -461,7 +461,7 @@ static int mac802154_header_create(struct sk_buff *skb, hdr.dest.pan_id = wpan_dev->pan_id; hdr.dest.mode = IEEE802154_ADDR_LONG; - memcpy(&hdr.dest.extended_addr, daddr, IEEE802154_EXTENDED_ADDR_LEN); + ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); hdr.source.pan_id = hdr.dest.pan_id; hdr.source.mode = IEEE802154_ADDR_LONG; @@ -469,8 +469,7 @@ static int mac802154_header_create(struct sk_buff *skb, if (!saddr) hdr.source.extended_addr = wpan_dev->extended_addr; else - memcpy(&hdr.source.extended_addr, saddr, - IEEE802154_EXTENDED_ADDR_LEN); + ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) @@ -496,8 +495,7 @@ mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) } if (hdr.source.mode == IEEE802154_ADDR_LONG) { - memcpy(haddr, &hdr.source.extended_addr, - IEEE802154_EXTENDED_ADDR_LEN); + ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr); return IEEE802154_EXTENDED_ADDR_LEN; } -- cgit v1.1 From fbef168fec837ae26c8725737cd4b49dc8a0f917 Mon Sep 17 
00:00:00 2001 From: Loic Poulain Date: Tue, 29 Sep 2015 15:05:44 +0200 Subject: Bluetooth: Add hci_cmd_sync function Send a HCI command and wait for command complete event. This function serializes the requests by grabbing the req_lock. Signed-off-by: Loic Poulain Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index a7cdd99..7935646 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3580,6 +3580,25 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } +/* Send HCI command and wait for command commplete event */ +struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + struct sk_buff *skb; + + if (!test_bit(HCI_UP, &hdev->flags)) + return ERR_PTR(-ENETDOWN); + + bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); + + hci_req_lock(hdev); + skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); + hci_req_unlock(hdev); + + return skb; +} +EXPORT_SYMBOL(hci_cmd_sync); + /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { -- cgit v1.1 From 35afa588624c4f9e19a0edfbb51769b59c90bb0d Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Wed, 9 Sep 2015 09:46:32 +0200 Subject: mac80211: Copy tx'ed beacons to monitor mode When debugging wireless powersave issues on the AP side it's quite helpful to see our own beacons that are transmitted by the hardware/driver. However, this is not that easy since beacons don't pass through the regular TX queues. Preferably drivers would call ieee80211_tx_status also for tx'ed beacons but that's not always possible. Hence, just send a copy of each beacon generated by ieee80211_beacon_get_tim to monitor devices when they are getting fetched by the driver. 
Also add a HW flag IEEE80211_HW_BEACON_TX_STATUS that can be used by drivers to indicate that they report TX status for beacons. Signed-off-by: Helmut Schaa (with a fix from Christian Lamparted rolled in) Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 1 + net/mac80211/tx.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 41726fd..3636b45 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -124,6 +124,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(SINGLE_SCAN_ON_ALL_BANDS), FLAG(TDLS_WIDER_BW), FLAG(SUPPORTS_AMSDU_IN_AMPDU), + FLAG(BEACON_TX_STATUS), /* keep last for the build bug below */ (void *)0x1 diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 84e0e8c..7354072 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3512,6 +3512,12 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, { struct ieee80211_mutable_offsets offs = {}; struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false); + struct sk_buff *copy; + struct ieee80211_supported_band *sband; + int shift; + + if (!bcn) + return bcn; if (tim_offset) *tim_offset = offs.tim_offset; @@ -3519,6 +3525,19 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (tim_length) *tim_length = offs.tim_length; + if (ieee80211_hw_check(hw, BEACON_TX_STATUS) || + !hw_to_local(hw)->monitors) + return bcn; + + /* send a copy to monitor interfaces */ + copy = skb_copy(bcn, GFP_ATOMIC); + if (!copy) + return bcn; + + shift = ieee80211_vif_get_shift(vif); + sband = hw->wiphy->bands[ieee80211_get_sdata_band(vif_to_sdata(vif))]; + ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false); + return bcn; } EXPORT_SYMBOL(ieee80211_beacon_get_tim); -- cgit v1.1 From b23dcd4aca1854cda520def01731ad035cae94d8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 15:19:34 +0200 Subject: mac80211: Deinline 
drv_conf_tx() With this .config: http://busybox.net/~vda/kernel_config_ALLYES_Os, after deinlining the function size is 785 bytes and there are 7 callsites. Total size reduction is about 3.5 kbytes. Signed-off-by: Denys Vlasenko CC: John Linville CC: Michal Kazior CC: Johannes Berg CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 25 +++++++++++++++++++++++++ net/mac80211/driver-ops.h | 27 +++------------------------ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 267c3b1..b28e66ca 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -39,3 +39,28 @@ int drv_sta_state(struct ieee80211_local *local, trace_drv_return_int(local, ret); return ret; } + +int drv_conf_tx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, u16 ac, + const struct ieee80211_tx_queue_params *params) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return -EIO; + + if (WARN_ONCE(params->cw_min == 0 || + params->cw_min > params->cw_max, + "%s: invalid CW_min/CW_max: %d/%d\n", + sdata->name, params->cw_min, params->cw_max)) + return -EINVAL; + + trace_drv_conf_tx(local, sdata, ac, params); + if (local->ops->conf_tx) + ret = local->ops->conf_tx(&local->hw, &sdata->vif, + ac, params); + trace_drv_return_int(local, ret); + return ret; +} diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 31482e2..6cc6bd4 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -646,30 +646,9 @@ static inline void drv_sta_statistics(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline int drv_conf_tx(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, u16 ac, - const struct ieee80211_tx_queue_params *params) -{ - int ret = -EOPNOTSUPP; - - might_sleep(); - - if 
(!check_sdata_in_driver(sdata)) - return -EIO; - - if (WARN_ONCE(params->cw_min == 0 || - params->cw_min > params->cw_max, - "%s: invalid CW_min/CW_max: %d/%d\n", - sdata->name, params->cw_min, params->cw_max)) - return -EINVAL; - - trace_drv_conf_tx(local, sdata, ac, params); - if (local->ops->conf_tx) - ret = local->ops->conf_tx(&local->hw, &sdata->vif, - ac, params); - trace_drv_return_int(local, ret); - return ret; -} +int drv_conf_tx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, u16 ac, + const struct ieee80211_tx_queue_params *params); static inline u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) -- cgit v1.1 From 4fbd572c29bd184146e8adf52631db193c4e34b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 15:19:35 +0200 Subject: mac80211: Deinline drv_sta_rc_update() With this .config: http://busybox.net/~vda/kernel_config_ALLYES_Os, after deinlining the function size is 706 bytes and there are 2 callsites, reducing code size by about 700 bytes. 
Signed-off-by: Denys Vlasenko CC: John Linville CC: Michal Kazior CC: Johannes Berg CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 20 ++++++++++++++++++++ net/mac80211/driver-ops.h | 22 +++------------------- 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index b28e66ca..b85f6ff 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -40,6 +40,26 @@ int drv_sta_state(struct ieee80211_local *local, return ret; } +void drv_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, u32 changed) +{ + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && + (sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); + + trace_drv_sta_rc_update(local, sdata, sta, changed); + if (local->ops->sta_rc_update) + local->ops->sta_rc_update(&local->hw, &sdata->vif, + sta, changed); + + trace_drv_return_void(local); +} + int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 ac, const struct ieee80211_tx_queue_params *params) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 6cc6bd4..2937bcb 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -596,25 +596,9 @@ int drv_sta_state(struct ieee80211_local *local, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); -static inline void drv_sta_rc_update(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u32 changed) -{ - sdata = get_bss_sdata(sdata); - if (!check_sdata_in_driver(sdata)) - return; - - WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && - (sdata->vif.type != NL80211_IFTYPE_ADHOC && - sdata->vif.type != 
NL80211_IFTYPE_MESH_POINT)); - - trace_drv_sta_rc_update(local, sdata, sta, changed); - if (local->ops->sta_rc_update) - local->ops->sta_rc_update(&local->hw, &sdata->vif, - sta, changed); - - trace_drv_return_void(local); -} +void drv_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, -- cgit v1.1 From 9aae296a6208188fb40da987efb6bcd92f4fb169 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 15:19:38 +0200 Subject: mac80211: Deinline drv_add/remove/change_interface() With this .config: http://busybox.net/~vda/kernel_config_ALLYES_Os, after deinlining these functions have sizes and callsite counts as follows: drv_add_interface: 638 bytes, 5 calls drv_remove_interface: 611 bytes, 6 calls drv_change_interface: 658 bytes, 1 call Total size reduction is about 9 kbytes. Signed-off-by: Denys Vlasenko CC: John Linville CC: Michal Kazior CC: Johannes Berg CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 54 +++++++++++++++++++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 58 ++++++----------------------------------------- 2 files changed, 61 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index b85f6ff..b284e6e 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -8,6 +8,60 @@ #include "trace.h" #include "driver-ops.h" +int drv_add_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + int ret; + + might_sleep(); + + if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) + return -EINVAL; + + 
trace_drv_add_interface(local, sdata); + ret = local->ops->add_interface(&local->hw, &sdata->vif); + trace_drv_return_int(local, ret); + + if (ret == 0) + sdata->flags |= IEEE80211_SDATA_IN_DRIVER; + + return ret; +} + +int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type, bool p2p) +{ + int ret; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_change_interface(local, sdata, type, p2p); + ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); + trace_drv_return_int(local, ret); + return ret; +} + +void drv_remove_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_remove_interface(local, sdata); + local->ops->remove_interface(&local->hw, &sdata->vif); + sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; + trace_drv_return_void(local); +} + __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 2937bcb..0baeefd 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -137,59 +137,15 @@ static inline void drv_set_wakeup(struct ieee80211_local *local, } #endif -static inline int drv_add_interface(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - int ret; - - might_sleep(); - - if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) - return -EINVAL; - - trace_drv_add_interface(local, sdata); - ret = local->ops->add_interface(&local->hw, &sdata->vif); - trace_drv_return_int(local, ret); - - if (ret == 0) - sdata->flags |= IEEE80211_SDATA_IN_DRIVER; - - return ret; -} - -static inline int drv_change_interface(struct ieee80211_local *local, 
- struct ieee80211_sub_if_data *sdata, - enum nl80211_iftype type, bool p2p) -{ - int ret; - - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return -EIO; - - trace_drv_change_interface(local, sdata, type, p2p); - ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); - trace_drv_return_int(local, ret); - return ret; -} +int drv_add_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); -static inline void drv_remove_interface(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return; +int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type, bool p2p); - trace_drv_remove_interface(local, sdata); - local->ops->remove_interface(&local->hw, &sdata->vif); - sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; - trace_drv_return_void(local); -} +void drv_remove_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); static inline int drv_config(struct ieee80211_local *local, u32 changed) { -- cgit v1.1 From 9f0e13546ef5773b7059b531a667ec47a5f897ee Mon Sep 17 00:00:00 2001 From: "Fu, Zhonghui" Date: Sat, 19 Sep 2015 10:40:14 +0800 Subject: net/wireless: enable wiphy device to suspend/resume asynchronously Now, PM core supports asynchronous suspend/resume mode for devices during system suspend/resume, and the power state transition of one device may be completed in separate kernel thread. PM core ensures all power state transition timing dependency between devices. This patch enables wiphy device to suspend/resume asynchronously. This can take advantage of multicore and improve system suspend/resume speed. 
Signed-off-by: Zhonghui Fu Signed-off-by: Johannes Berg --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 3893409..f223026 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -419,6 +419,7 @@ use_default_name: device_initialize(&rdev->wiphy.dev); rdev->wiphy.dev.class = &ieee80211_class; rdev->wiphy.dev.platform_data = rdev; + device_enable_async_suspend(&rdev->wiphy.dev); INIT_LIST_HEAD(&rdev->destroy_list); spin_lock_init(&rdev->destroy_list_lock); -- cgit v1.1 From 27392719541c89595a5c03d49b599ddfe009e6b8 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Mon, 21 Sep 2015 15:50:26 +0300 Subject: mac80211: don't tear down aggregation on suspend in case of wowlan->any In case of "any" wowlan trigger, there is no reason to tear down aggregations, as we want the device to continue working normally. Similarly, there's no reason to tear down aggregations on resume, as they should have been torn down on suspend if needed. However, since the reconfiguration flow is shared with HW restart, tear down aggregations on reconfiguration when we are not resuming. To keep things working after non-wowlan suspend, keep clearing the WLAN_STA_BLOCK_BA flag. 
Signed-off-by: Eliad Peller Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/pm.c | 3 ++- net/mac80211/util.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index b676b9f..ad88ad4 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -23,7 +23,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) ieee80211_del_virtual_monitor(local); - if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION) && + !(wowlan && wowlan->any)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 9cabf07..62948bb 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2017,8 +2017,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { - ieee80211_sta_tear_down_BA_sessions( - sta, AGG_STOP_LOCAL_REQUEST); + if (!local->resuming) + ieee80211_sta_tear_down_BA_sessions( + sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } -- cgit v1.1 From a0c391b1345cfaecfb24c3c07378d80f6168fb61 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Sep 2015 10:29:20 +0200 Subject: mac80211: minstrel[_ht]: remove non-ascii debugfs characters Replace the average symbol by "avg" to avoid being warned about the non-ASCII symbol all the time, line up the columns properly. 
(I changed my mind - the warnings are getting annoying) Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_debugfs.c | 12 +++++------- net/mac80211/rc80211_minstrel_ht_debugfs.c | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 1db5f7c..820b0ab 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -85,12 +85,10 @@ minstrel_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; p += sprintf(p, "\n"); - p += sprintf(p, "best __________rate_________ ______" - "statistics______ ________last_______ " - "______sum-of________\n"); - p += sprintf(p, "rate [name idx airtime max_tp] [ ø(tp) ø(prob) " - "sd(prob)] [prob.|retry|suc|att] " - "[#success | #attempts]\n"); + p += sprintf(p, + "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n"); + p += sprintf(p, + "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; @@ -112,7 +110,7 @@ minstrel_stats_open(struct inode *inode, struct file *file) prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" " %3u.%1u %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 6822ce0..5320e35 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -86,7 +86,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - p += sprintf(p, 
"%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" " %3u.%1u %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, @@ -129,12 +129,10 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) p = ms->buf; p += sprintf(p, "\n"); - p += sprintf(p, " best ____________rate__________ " - "______statistics______ ________last_______ " - "______sum-of________\n"); - p += sprintf(p, "mode guard # rate [name idx airtime max_tp] " - "[ ø(tp) ø(prob) sd(prob)] [prob.|retry|suc|att] [#success | " - "#attempts]\n"); + p += sprintf(p, + " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n"); + p += sprintf(p, + "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) -- cgit v1.1 From 0e5c371aa05522ac14e91ddee0522ad855e12d02 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Sep 2015 14:02:47 +0200 Subject: mac80211: improve __rate_control_send_low warning If there are no supported rates in the rate mask with the required flags, we warn, but it's not clear which part causes the warning. Add the relevant data to the warning to understand why it happens. Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 9857693..48d053b 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -305,7 +305,10 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, info->control.rates[0].idx = i; break; } - WARN_ON_ONCE(i == sband->n_bitrates); + WARN_ONCE(i == sband->n_bitrates, + "no supported rates (0x%x) in rate_mask 0x%x with flags 0x%x\n", + sta ? sta->supp_rates[sband->band] : 0, + rate_mask, rate_flags); info->control.rates[0].count = (info->flags & IEEE80211_TX_CTL_NO_ACK) ? 
-- cgit v1.1 From 42677ed33a8b6995e6af2ec15643840afcf1c48b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 23 Sep 2015 14:18:34 +0200 Subject: mac80211: Deinline drv_switch_vif_chanctx() With this .config: http://busybox.net/~vda/kernel_config_ALLYES_Os, after deinlining the function size is 821 bytes and there are 2 callsites, reducing code size by about 800 bytes. Signed-off-by: Denys Vlasenko CC: Johannes Berg CC: John Linville CC: Michal Kazior CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org [adjust code-style a bit] Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 51 +++++++++++++++++++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 55 +++-------------------------------------------- 2 files changed, 54 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index b284e6e..1a720e8 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -138,3 +138,54 @@ int drv_conf_tx(struct ieee80211_local *local, trace_drv_return_int(local, ret); return ret; } + +int drv_switch_vif_chanctx(struct ieee80211_local *local, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, enum ieee80211_chanctx_switch_mode mode) +{ + int ret = 0; + int i; + + if (!local->ops->switch_vif_chanctx) + return -EOPNOTSUPP; + + for (i = 0; i < n_vifs; i++) { + struct ieee80211_chanctx *new_ctx = + container_of(vifs[i].new_ctx, + struct ieee80211_chanctx, + conf); + struct ieee80211_chanctx *old_ctx = + container_of(vifs[i].old_ctx, + struct ieee80211_chanctx, + conf); + + WARN_ON_ONCE(!old_ctx->driver_present); + WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && + new_ctx->driver_present) || + (mode == CHANCTX_SWMODE_REASSIGN_VIF && + !new_ctx->driver_present)); + } + + trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); + ret = local->ops->switch_vif_chanctx(&local->hw, + vifs, n_vifs, mode); + trace_drv_return_int(local, ret); + + if (!ret && mode == 
CHANCTX_SWMODE_SWAP_CONTEXTS) { + for (i = 0; i < n_vifs; i++) { + struct ieee80211_chanctx *new_ctx = + container_of(vifs[i].new_ctx, + struct ieee80211_chanctx, + conf); + struct ieee80211_chanctx *old_ctx = + container_of(vifs[i].old_ctx, + struct ieee80211_chanctx, + conf); + + new_ctx->driver_present = true; + old_ctx->driver_present = false; + } + } + + return ret; +} diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 0baeefd..275146e 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1002,58 +1002,9 @@ static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline int -drv_switch_vif_chanctx(struct ieee80211_local *local, - struct ieee80211_vif_chanctx_switch *vifs, - int n_vifs, - enum ieee80211_chanctx_switch_mode mode) -{ - int ret = 0; - int i; - - if (!local->ops->switch_vif_chanctx) - return -EOPNOTSUPP; - - for (i = 0; i < n_vifs; i++) { - struct ieee80211_chanctx *new_ctx = - container_of(vifs[i].new_ctx, - struct ieee80211_chanctx, - conf); - struct ieee80211_chanctx *old_ctx = - container_of(vifs[i].old_ctx, - struct ieee80211_chanctx, - conf); - - WARN_ON_ONCE(!old_ctx->driver_present); - WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && - new_ctx->driver_present) || - (mode == CHANCTX_SWMODE_REASSIGN_VIF && - !new_ctx->driver_present)); - } - - trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); - ret = local->ops->switch_vif_chanctx(&local->hw, - vifs, n_vifs, mode); - trace_drv_return_int(local, ret); - - if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { - for (i = 0; i < n_vifs; i++) { - struct ieee80211_chanctx *new_ctx = - container_of(vifs[i].new_ctx, - struct ieee80211_chanctx, - conf); - struct ieee80211_chanctx *old_ctx = - container_of(vifs[i].old_ctx, - struct ieee80211_chanctx, - conf); - - new_ctx->driver_present = true; - old_ctx->driver_present = false; - } - } - - return ret; -} +int drv_switch_vif_chanctx(struct 
ieee80211_local *local, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, enum ieee80211_chanctx_switch_mode mode); static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) -- cgit v1.1 From 6db96838971eb4c8ae6285795188f391e97d47c3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 23 Sep 2015 14:18:35 +0200 Subject: mac80211: Deinline drv_ampdu_action() With this .config: http://busybox.net/~vda/kernel_config_ALLYES_Os, after deinlining the function size is 755 bytes and there are 6 callsites. Total size reduction is about 3.3 kbytes. Signed-off-by: Denys Vlasenko CC: Johannes Berg CC: John Linville CC: Michal Kazior CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 26 ++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 30 +++++------------------------- 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 1a720e8..e1bb9e04 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -189,3 +189,29 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local, return ret; } + +int drv_ampdu_action(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_ampdu_mlme_action action, + struct ieee80211_sta *sta, u16 tid, + u16 *ssn, u8 buf_size, bool amsdu) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_ampdu_action(local, sdata, action, sta, tid, + ssn, buf_size, amsdu); + + if (local->ops->ampdu_action) + ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, + sta, tid, ssn, buf_size, amsdu); + + trace_drv_return_int(local, ret); + + return ret; +} diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 275146e..6411c3b9 100644 --- a/net/mac80211/driver-ops.h +++ 
b/net/mac80211/driver-ops.h @@ -649,31 +649,11 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local) return ret; } -static inline int drv_ampdu_action(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, - u16 *ssn, u8 buf_size, bool amsdu) -{ - int ret = -EOPNOTSUPP; - - might_sleep(); - - sdata = get_bss_sdata(sdata); - if (!check_sdata_in_driver(sdata)) - return -EIO; - - trace_drv_ampdu_action(local, sdata, action, sta, tid, - ssn, buf_size, amsdu); - - if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, - sta, tid, ssn, buf_size, amsdu); - - trace_drv_return_int(local, ret); - - return ret; -} +int drv_ampdu_action(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_ampdu_mlme_action action, + struct ieee80211_sta *sta, u16 tid, + u16 *ssn, u8 buf_size, bool amsdu); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) -- cgit v1.1 From 416eb9fc29469f036c85b412edf89774d6b34b0f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 23 Sep 2015 14:18:36 +0200 Subject: mac80211: Deinline drv_get/set/reset_tsf() With this .config: http://busybox.net/~vda/kernel_config_ALLYES_Os, after deinlining these functions have sizes and callsite counts as follows: drv_get_tsf: 634 bytes, 6 calls drv_set_tsf: 626 bytes, 2 calls drv_reset_tsf: 617 bytes, 2 calls Total size reduction is about 4.2 kbytes. 
Signed-off-by: Denys Vlasenko CC: Johannes Berg CC: John Linville CC: Michal Kazior CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.c | 46 +++++++++++++++++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 52 +++++++---------------------------------------- 2 files changed, 53 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index e1bb9e04..a1d5431 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -139,6 +139,52 @@ int drv_conf_tx(struct ieee80211_local *local, return ret; } +u64 drv_get_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + u64 ret = -1ULL; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return ret; + + trace_drv_get_tsf(local, sdata); + if (local->ops->get_tsf) + ret = local->ops->get_tsf(&local->hw, &sdata->vif); + trace_drv_return_u64(local, ret); + return ret; +} + +void drv_set_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u64 tsf) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_set_tsf(local, sdata, tsf); + if (local->ops->set_tsf) + local->ops->set_tsf(&local->hw, &sdata->vif, tsf); + trace_drv_return_void(local); +} + +void drv_reset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_reset_tsf(local, sdata); + if (local->ops->reset_tsf) + local->ops->reset_tsf(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 6411c3b9..3098709 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -590,51 +590,13 @@ int 
drv_conf_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 ac, const struct ieee80211_tx_queue_params *params); -static inline u64 drv_get_tsf(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - u64 ret = -1ULL; - - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return ret; - - trace_drv_get_tsf(local, sdata); - if (local->ops->get_tsf) - ret = local->ops->get_tsf(&local->hw, &sdata->vif); - trace_drv_return_u64(local, ret); - return ret; -} - -static inline void drv_set_tsf(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - u64 tsf) -{ - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return; - - trace_drv_set_tsf(local, sdata, tsf); - if (local->ops->set_tsf) - local->ops->set_tsf(&local->hw, &sdata->vif, tsf); - trace_drv_return_void(local); -} - -static inline void drv_reset_tsf(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) -{ - might_sleep(); - - if (!check_sdata_in_driver(sdata)) - return; - - trace_drv_reset_tsf(local, sdata); - if (local->ops->reset_tsf) - local->ops->reset_tsf(&local->hw, &sdata->vif); - trace_drv_return_void(local); -} +u64 drv_get_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); +void drv_set_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u64 tsf); +void drv_reset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); static inline int drv_tx_last_beacon(struct ieee80211_local *local) { -- cgit v1.1 From d0a77c6569abe29d921148c45f598bc796084226 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Sep 2015 10:42:28 +0200 Subject: mac80211: allow writing TX PN in debugfs For certain tests, for example replay detection, it can be useful to be able to influence/set the PN used in outgoing packets. Make it possible to change the TX PN in debugfs. 
For now, this doesn't support TKIP since I haven't needed it, but there's no reason it couldn't be added if necessary. Note that this must be used very carefully: it could, for example, be used to make "valid replays" where the PN reuse happens on a different TID. This couldn't be done by an attacker since the TID is protected as part of the AAD. Signed-off-by: Johannes Berg --- net/mac80211/debugfs_key.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 702ca12..7961e7d0 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -2,6 +2,7 @@ * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc * Copyright 2007 Johannes Berg + * Copyright (C) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -34,6 +35,14 @@ static const struct file_operations key_ ##name## _ops = { \ .llseek = generic_file_llseek, \ } +#define KEY_OPS_W(name) \ +static const struct file_operations key_ ##name## _ops = { \ + .read = key_##name##_read, \ + .write = key_##name##_write, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + #define KEY_FILE(name, format) \ KEY_READ_##format(name) \ KEY_OPS(name) @@ -74,6 +83,41 @@ static ssize_t key_algorithm_read(struct file *file, } KEY_OPS(algorithm); +static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + u64 pn; + int ret; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + return -EINVAL; + case WLAN_CIPHER_SUITE_TKIP: + /* not supported yet */ + return -EOPNOTSUPP; + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_AES_CMAC: + case 
WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + ret = kstrtou64_from_user(userbuf, count, 16, &pn); + if (ret) + return ret; + /* PN is a 48-bit counter */ + if (pn >= (1ULL << 48)) + return -ERANGE; + atomic64_set(&key->conf.tx_pn, pn); + return count; + default: + return 0; + } +} + static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -110,7 +154,7 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } -KEY_OPS(tx_spec); +KEY_OPS_W(tx_spec); static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -278,6 +322,9 @@ KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ key, &key_##name##_ops); +#define DEBUGFS_ADD_W(name) \ + debugfs_create_file(#name, 0600, key->debugfs.dir, \ + key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) { @@ -310,7 +357,7 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); DEBUGFS_ADD(algorithm); - DEBUGFS_ADD(tx_spec); + DEBUGFS_ADD_W(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); DEBUGFS_ADD(icverrors); -- cgit v1.1 From 47edb11b522561658fe719e56aa69a3c3098a3fe Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Mon, 21 Sep 2015 15:49:53 +0300 Subject: cfg80211: allow changing station capabilities for unassociated stations Currently, cfg80211 rejects capability updates for existing entries and as a result it's impossible to update entries that were added unassociated, but that is necessary to go through the full station states from userspace, adding a station before authentication etc. 
Fix this by allowing updates to capabilities for stations that the driver (or mac80211) assigned unassociated state. Drivers setting the full station state support flag must use the new station type for proper operation. Signed-off-by: Ayala Beker Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 50cd770..f05ba8b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4009,7 +4009,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER); } - if (statype != CFG80211_STA_TDLS_PEER_SETUP) { + if (statype != CFG80211_STA_TDLS_PEER_SETUP && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) { /* reject other things that can't change */ if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) return -EINVAL; @@ -4021,7 +4022,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; } - if (statype != CFG80211_STA_AP_CLIENT) { + if (statype != CFG80211_STA_AP_CLIENT && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) { if (params->vlan) return -EINVAL; } @@ -4033,6 +4035,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EOPNOTSUPP; break; case CFG80211_STA_AP_CLIENT: + case CFG80211_STA_AP_CLIENT_UNASSOC: /* accept only the listed bits */ if (params->sta_flags_mask & ~(BIT(NL80211_STA_FLAG_AUTHORIZED) | -- cgit v1.1 From 44674d9c2267f454f38df7b2395939bfa911f92e Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Wed, 23 Sep 2015 10:41:27 +0200 Subject: mac80211: advertise support for full station state in AP mode This enables adding stations in unauthenticated mode, just after receiving the first authentication frame; which in turn allows sending a negative authentication reply if the station cannot be added. 
In addition init rate control for unassociated station only when it becomes associated, prior to that low rates will be used. Signed-off-by: Ayala Beker Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 33 ++++++++++++++++++++++----------- net/mac80211/main.c | 3 ++- 2 files changed, 24 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9eab783..1b91fcd 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -981,7 +981,7 @@ static int sta_apply_auth_flags(struct ieee80211_local *local, * well. Some drivers require rate control initialized * before drv_sta_state() is called. */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) rate_control_rate_init(sta); ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); @@ -1120,8 +1120,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, local->hw.queues >= IEEE80211_NUM_ACS) sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); - /* auth flags will be set later for TDLS stations */ - if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + /* auth flags will be set later for TDLS, + * and for unassociated stations that move to assocaited */ + if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) && + (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; @@ -1213,7 +1216,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta_apply_mesh_params(local, sta, params); /* set the STA state after all sta info from usermode has been set */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || + set & BIT(NL80211_STA_FLAG_ASSOCIATED)) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; @@ -1255,12 +1259,14 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, * defaults -- if userspace wants something else we'll * 
change it accordingly in sta_apply_parameters() */ - if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) { + if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) && + !(params->sta_flags_set & (BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED)))) { sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - } else { - sta->sta.tdls = true; } + if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) + sta->sta.tdls = true; err = sta_apply_parameters(local, sta, params); if (err) { @@ -1269,10 +1275,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, } /* - * for TDLS, rate control should be initialized only when - * rates are known and station is marked authorized + * for TDLS and for unassociated station, rate control should be + * initialized only when rates are known and station is marked + * authorized/associated */ - if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN || @@ -1347,7 +1355,10 @@ static int ieee80211_change_station(struct wiphy *wiphy, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: - statype = CFG80211_STA_AP_CLIENT; + if (test_sta_flag(sta, WLAN_STA_ASSOC)) + statype = CFG80211_STA_AP_CLIENT; + else + statype = CFG80211_STA_AP_CLIENT_UNASSOC; break; default: err = -EOPNOTSUPP; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ff79a13..9b813a2 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -543,7 +543,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_FEATURE_HT_IBSS | NL80211_FEATURE_VIF_TXPOWER | NL80211_FEATURE_MAC_ON_CREATE | - NL80211_FEATURE_USERSPACE_MPM; + NL80211_FEATURE_USERSPACE_MPM | + NL80211_FEATURE_FULL_AP_CLIENT_STATE; if (!ops->hw_scan) wiphy->features |= 
NL80211_FEATURE_LOW_PRIORITY_SCAN | -- cgit v1.1 From 50f36ae61a5b65ba4612a5d2aa696c8ac5b6c988 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 24 Sep 2015 14:59:48 +0200 Subject: mac80211: fix tx sequence number assignment with software queue + fast-xmit When using software queueing, tx sequence number assignment happens at ieee80211_tx_dequeue time, so the fast-xmit codepath must not do that. Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7354072..464ba1a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2767,7 +2767,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { *ieee80211_get_qos_ctl(hdr) = tid; - hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); + if (!sta->sta.txq[0]) + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); -- cgit v1.1 From 90d13e8f5b3c2445f481be4a2012a1861337f718 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Sep 2015 16:13:07 +0200 Subject: mac80211: reduce indentation by inlining a check Instead of nesting two if statements, inline the second check into the first if statement and reduce indentation.
Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c7d316b..88894d9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3437,31 +3437,27 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, len - baselen, false, &elems, care_about_ies, ncrc); - if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) { - bool directed_tim = ieee80211_check_tim(elems.tim, - elems.tim_len, - ifmgd->aid); - if (directed_tim) { - if (local->hw.conf.dynamic_ps_timeout > 0) { - if (local->hw.conf.flags & IEEE80211_CONF_PS) { - local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, - IEEE80211_CONF_CHANGE_PS); - } - ieee80211_send_nullfunc(local, sdata, 0); - } else if (!local->pspolling && sdata->u.mgd.powersave) { - local->pspolling = true; - - /* - * Here is assumed that the driver will be - * able to send ps-poll frame and receive a - * response even though power save mode is - * enabled, but some drivers might require - * to disable power save here. This needs - * to be investigated. - */ - ieee80211_send_pspoll(local, sdata); + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && + ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) { + if (local->hw.conf.dynamic_ps_timeout > 0) { + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); } + ieee80211_send_nullfunc(local, sdata, 0); + } else if (!local->pspolling && sdata->u.mgd.powersave) { + local->pspolling = true; + + /* + * Here is assumed that the driver will be + * able to send ps-poll frame and receive a + * response even though power save mode is + * enabled, but some drivers might require + * to disable power save here. This needs + * to be investigated. 
+ */ + ieee80211_send_pspoll(local, sdata); } } -- cgit v1.1 From 076cdcb12f784b2057f172b5caca641fafa67cdf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Sep 2015 16:14:55 +0200 Subject: mac80211: use bool argument to ieee80211_send_nullfunc Instead of int with 0/1, use bool with false/true for the powersave argument to ieee80211_send_nullfunc(). Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 10 +++++----- net/mac80211/offchannel.c | 6 +++--- net/mac80211/util.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 1af655a..f9605f1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1850,7 +1850,7 @@ void ieee80211_dynamic_ps_disable_work(struct work_struct *work); void ieee80211_dynamic_ps_timer(unsigned long data); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - int powersave); + bool powersave); void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 88894d9..88047bf 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -936,7 +936,7 @@ void ieee80211_send_pspoll(struct ieee80211_local *local, void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - int powersave) + bool powersave) { struct sk_buff *skb; struct ieee80211_hdr_3addr *nullfunc; @@ -1420,7 +1420,7 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) - ieee80211_send_nullfunc(local, sdata, 1); + ieee80211_send_nullfunc(local, sdata, true); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_hw_check(&local->hw, 
REPORTS_TX_ACK_STATUS)) @@ -1635,7 +1635,7 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); } else { - ieee80211_send_nullfunc(local, sdata, 1); + ieee80211_send_nullfunc(local, sdata, true); /* Flush to get the tx status of nullfunc frame */ ieee80211_flush_queues(local, sdata, false); } @@ -2268,7 +2268,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; - ieee80211_send_nullfunc(sdata->local, sdata, 0); + ieee80211_send_nullfunc(sdata->local, sdata, false); } else { int ssid_len; @@ -3445,7 +3445,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); } else if (!local->pspolling && sdata->u.mgd.powersave) { local->pspolling = true; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f2c75cf..0440103 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -57,7 +57,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) * to send a new nullfunc frame to inform the AP that we * are again sleeping. 
*/ - ieee80211_send_nullfunc(local, sdata, 1); + ieee80211_send_nullfunc(local, sdata, true); } /* inform AP that we are awake again, unless power save is enabled */ @@ -66,7 +66,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; if (!local->ps_sdata) - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); else if (local->offchannel_ps_enabled) { /* * In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware @@ -93,7 +93,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) * restart the timer now and send a nullfunc frame to inform * the AP that we are awake. */ - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 62948bb..60c4dbf 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1966,7 +1966,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (!sdata->u.mgd.associated) continue; - ieee80211_send_nullfunc(local, sdata, 0); + ieee80211_send_nullfunc(local, sdata, false); } } -- cgit v1.1 From 188515fbc6b18e6bc6f2fa4629f1a77308197371 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Sep 2015 20:08:51 -0500 Subject: openvswitch: Pass net into ovs_vport_output When struct net starts being passed through the ipv4 and ipv6 fragment routines ovs_vport_output will need to take a net parameter. Prepare ovs_vport_output before that is needed and introduce ovs_vport_output_sk for the call sites that still need the old calling conventions. Signed-off-by: "Eric W.
Biederman" --- net/openvswitch/actions.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 315f533..f00c641 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -620,7 +620,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static int ovs_vport_output(struct sock *sock, struct sk_buff *skb) +static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); struct vport *vport = data->vport; @@ -645,6 +645,11 @@ static int ovs_vport_output(struct sock *sock, struct sk_buff *skb) ovs_vport_send(vport, skb); return 0; } +static int ovs_vport_output_sk(struct sock *sk, struct sk_buff *skb) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + return ovs_vport_output(net, sk, skb); +} static unsigned int ovs_dst_get_mtu(const struct dst_entry *dst) @@ -700,7 +705,7 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, skb_dst_set_noref(skb, &ovs_dst); IPCB(skb)->frag_max_size = mru; - ip_do_fragment(skb->sk, skb, ovs_vport_output); + ip_do_fragment(skb->sk, skb, ovs_vport_output_sk); refdst_drop(orig_dst); } else if (ethertype == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); @@ -722,7 +727,7 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, skb_dst_set_noref(skb, &ovs_rt.dst); IP6CB(skb)->frag_max_size = mru; - v6ops->fragment(skb->sk, skb, ovs_vport_output); + v6ops->fragment(skb->sk, skb, ovs_vport_output_sk); refdst_drop(orig_dst); } else { WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", -- cgit v1.1 From c559cd3ad32ba729bb810283c5fc6838d2473c2e Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 14 Sep 2015 20:10:28 -0500 Subject: openvswitch: Pass net into ovs_fragment In preparation for the ipv4 and ipv6 fragmentation code taking a net parameter pass a struct net into ovs_fragment where the v4 and v6 fragmentation code is called. Signed-off-by: "Eric W. Biederman" --- net/openvswitch/actions.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index f00c641..ba38662 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -684,8 +684,8 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb) skb_pull(skb, hlen); } -static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, - __be16 ethertype) +static void ovs_fragment(struct net *net, struct vport *vport, + struct sk_buff *skb, u16 mru, __be16 ethertype) { if (skb_network_offset(skb) > MAX_L2_LEN) { OVS_NLERR(1, "L2 header too long to fragment"); @@ -748,6 +748,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { ovs_vport_send(vport, skb); } else if (mru <= vport->dev->mtu) { + struct net *net = read_pnet(&dp->net); __be16 ethertype = key->eth.type; if (!is_flow_key_valid(key)) { @@ -757,7 +758,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, ethertype = vlan_get_protocol(skb); } - ovs_fragment(vport, skb, mru, ethertype); + ovs_fragment(net, vport, skb, mru, ethertype); } else { kfree_skb(skb); } -- cgit v1.1 From 694869b3c5440e0d821583ec8811b6cb5d03742d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 12 Jun 2015 21:55:31 -0500 Subject: ipv4: Pass struct net through ip_fragment Signed-off-by: "Eric W. 
Biederman" --- net/bridge/br_netfilter_hooks.c | 6 +++--- net/ipv4/ip_output.c | 44 +++++++++++++++++++---------------------- net/openvswitch/actions.c | 2 +- 3 files changed, 24 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 13f0367..00e356c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -701,7 +701,7 @@ static int br_nf_push_frag_xmit_sk(struct sock *sk, struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) + int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(skb); struct iphdr *iph = ip_hdr(skb); @@ -714,7 +714,7 @@ br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } #endif @@ -763,7 +763,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit_sk); + return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); } #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index aff6766..911ea73 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,9 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); -static int ip_fragment(struct sock *sk, struct sk_buff *skb, - unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)); +static int +ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. 
*/ void ip_send_check(struct iphdr *iph) @@ -176,12 +177,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; - struct net *net = dev_net(dev); unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; u32 nexthop; @@ -225,8 +225,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, - unsigned int mtu) +static int ip_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -235,7 +235,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || skb_gso_network_seglen(skb) <= mtu) - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. 
* @@ -258,7 +258,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, int err; segs->next = NULL; - err = ip_fragment(sk, segs, mtu, ip_finish_output2); + err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -281,12 +281,12 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk #endif mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb, mtu); + return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) - return ip_fragment(sk, skb, mtu, ip_finish_output2); + return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); } int ip_mc_output(struct sock *sk, struct sk_buff *skb) @@ -495,20 +495,18 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static int ip_fragment(struct sock *sk, struct sk_buff *skb, +static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)) + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { - struct net *net = dev_net(skb_rtable(skb)->dst.dev); - IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -516,7 +514,7 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } /* @@ -526,8 +524,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, * single device frame, and queue such a frame for sending. 
*/ -int ip_do_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -537,11 +535,9 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, int offset; __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); - struct net *net; int err = 0; dev = rt->dst.dev; - net = dev_net(dev); /* * Point into the IP datagram header. @@ -631,7 +627,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, ip_send_check(iph); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); @@ -771,7 +767,7 @@ slow_path: ip_send_check(iph); - err = output(sk, skb2); + err = output(net, sk, skb2); if (err) goto fail; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ba38662..b281b2b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -705,7 +705,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, skb_dst_set_noref(skb, &ovs_dst); IPCB(skb)->frag_max_size = mru; - ip_do_fragment(skb->sk, skb, ovs_vport_output_sk); + ip_do_fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); } else if (ethertype == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); -- cgit v1.1 From 7d8c6e391575ee86c870b88635a163743fca9eac Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 12 Jun 2015 22:12:04 -0500 Subject: ipv6: Pass struct net through ip6_fragment Signed-off-by: Eric W. 
Biederman --- net/bridge/br_netfilter_hooks.c | 2 +- net/ipv6/ip6_output.c | 16 +++++++--------- net/ipv6/xfrm6_output.c | 10 ++++++++-- net/openvswitch/actions.c | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 00e356c..815994d 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -786,7 +786,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff data->size); if (v6ops) - return v6ops->fragment(sk, skb, br_nf_push_frag_xmit_sk); + return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a598fe2..caf7d14 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,11 +56,10 @@ #include #include -static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; - struct net *net = dev_net(dev); struct neighbour *neigh; struct in6_addr *nexthop; int ret; @@ -126,9 +125,9 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) - return ip6_fragment(sk, skb, ip6_finish_output2); + return ip6_fragment(net, sk, skb, ip6_finish_output2); else - return ip6_finish_output2(sk, skb); + return ip6_finish_output2(net, sk, skb); } int ip6_output(struct sock *sk, struct sk_buff *skb) @@ -554,8 +553,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -int ip6_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip6_fragment(struct net *net, struct sock 
*sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); @@ -568,7 +567,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, __be32 frag_id; int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; - struct net *net = dev_net(skb_dst(skb)->dev); hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; @@ -688,7 +686,7 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, ip6_copy_metadata(frag, skb); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); @@ -816,7 +814,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - err = output(sk, frag); + err = output(net, sk, frag); if (err) goto fail; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 0c3e9ff..335066a 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -131,6 +131,12 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } +static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + return x->outer_mode->afinfo->output_finish(sk, skb); +} + static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -160,8 +166,8 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (x->props.mode == XFRM_MODE_TUNNEL && ((skb->len > mtu && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(sk, skb, - x->outer_mode->afinfo->output_finish); + return ip6_fragment(net, sk, skb, + __xfrm6_output_finish); } return x->outer_mode->afinfo->output_finish(sk, skb); } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b281b2b..f33c627 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -727,7 +727,7 @@ 
static void ovs_fragment(struct net *net, struct vport *vport, skb_dst_set_noref(skb, &ovs_rt.dst); IP6CB(skb)->frag_max_size = mru; - v6ops->fragment(skb->sk, skb, ovs_vport_output_sk); + v6ops->fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); } else { WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", -- cgit v1.1 From 75aec9df3a7895747a0d022b7c83a1dfb2adf942 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Sep 2015 13:46:16 -0500 Subject: bridge: Remove br_nf_push_frag_xmit_sk Now that this compatibility function no longer has any callers remove it. Signed-off-by: "Eric W. Biederman" --- net/bridge/br_netfilter_hooks.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 815994d..370aa4d 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -691,11 +691,6 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } -static int br_nf_push_frag_xmit_sk(struct sock *sk, struct sk_buff *skb) -{ - struct net *net = dev_net(skb_dst(skb)->dev); - return br_nf_push_frag_xmit(net, sk, skb); -} #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) -- cgit v1.1 From 184e16d79d38634cbb7b7c1cd3832caf89595c9a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Sep 2015 20:14:45 -0500 Subject: openvswitch: Remove ovs_vport_output_sk This was a compatibility function needed while the ipv4 and ipv6 fragmentation code was being modified to pass a struct net through them. Now that is complete this function has no more users so remove it. Signed-off-by: "Eric W. 
Biederman" --- net/openvswitch/actions.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index f33c627..1d21ab9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -645,11 +645,6 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk ovs_vport_send(vport, skb); return 0; } -static int ovs_vport_output_sk(struct sock *sk, struct sk_buff *skb) -{ - struct net *net = dev_net(skb_dst(skb)->dev); - return ovs_vport_output(net, sk, skb); -} static unsigned int ovs_dst_get_mtu(const struct dst_entry *dst) -- cgit v1.1 From 1ee06ef1596dcc5858ea29ef9faf0f29e139dfcc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Sep 2015 09:00:24 +0200 Subject: nl802154: use nla_get_le64 for get extended addr This patch uses the nla_get_le64 function instead of doing a force converting to le64. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 3f89c0a..51110a6 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -753,10 +753,8 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - /* TODO add nla_get_le64 to netlink */ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) - extended_addr = (__force __le64)nla_get_u64( - info->attrs[NL802154_ATTR_EXTENDED_ADDR]); + extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); if (!rdev->ops->add_virtual_intf) return -EOPNOTSUPP; -- cgit v1.1 From a26c5fd7622d4951425131d54a8c99f076fe2068 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Sep 2015 09:00:25 +0200 Subject: nl802154: add support for security layer This patch adds support for accessing mac802154 llsec implementation over nl802154. 
I added for a new Kconfig entry to provide this functionality CONFIG_IEEE802154_NL802154_EXPERIMENTAL. This interface is still in development. It provides to change security parameters and add/del/dump entries of security tables. Later we can add also a get to get an entry by unique identifier. Cc: Phoebe Buckheister Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/Kconfig | 5 + net/ieee802154/core.c | 12 + net/ieee802154/core.h | 1 + net/ieee802154/nl802154.c | 1316 +++++++++++++++++++++++++++++++++++++++++---- net/ieee802154/rdev-ops.h | 109 ++++ net/mac802154/cfg.c | 205 +++++++ 6 files changed, 1554 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig index 1370d5b..188135b 100644 --- a/net/ieee802154/Kconfig +++ b/net/ieee802154/Kconfig @@ -12,6 +12,11 @@ menuconfig IEEE802154 if IEEE802154 +config IEEE802154_NL802154_EXPERIMENTAL + bool "IEEE 802.15.4 experimental netlink support" + ---help--- + Adds experimental netlink support for nl802154. 
+ config IEEE802154_SOCKET tristate "IEEE 802.15.4 socket interface" default y diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index b0248e9..c35fdfa 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -95,6 +95,18 @@ cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx) return result; } +struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx) +{ + struct cfg802154_registered_device *rdev; + + ASSERT_RTNL(); + + rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx); + if (!rdev) + return NULL; + return &rdev->wpan_phy; +} + struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) { diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h index f3e9558..231fade 100644 --- a/net/ieee802154/core.h +++ b/net/ieee802154/core.h @@ -42,5 +42,6 @@ extern int cfg802154_rdev_list_generation; void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); +struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */ diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 51110a6..1e9e865 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -232,8 +232,86 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, + [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, + [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, }, + [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 }, + + [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED }, + [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED }, +#endif /* 
CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static int +nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg802154_registered_device **rdev, + struct wpan_dev **wpan_dev) +{ + int err; + + rtnl_lock(); + + if (!cb->args[0]) { + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, + nl802154_fam.attrbuf, nl802154_fam.maxattr, + nl802154_policy); + if (err) + goto out_unlock; + + *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), + nl802154_fam.attrbuf); + if (IS_ERR(*wpan_dev)) { + err = PTR_ERR(*wpan_dev); + goto out_unlock; + } + *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy); + /* 0 is the first index - add 1 to parse only once */ + cb->args[0] = (*rdev)->wpan_phy_idx + 1; + cb->args[1] = (*wpan_dev)->identifier; + } else { + /* subtract the 1 again here */ + struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1); + struct wpan_dev *tmp; + + if (!wpan_phy) { + err = -ENODEV; + goto out_unlock; + } + *rdev = wpan_phy_to_rdev(wpan_phy); + *wpan_dev = NULL; + + list_for_each_entry(tmp, &(*rdev)->wpan_dev_list, list) { + if (tmp->identifier == cb->args[1]) { + *wpan_dev = tmp; + break; + } + } + + if (!*wpan_dev) { + err = -ENODEV; + goto out_unlock; + } + } + + return 0; + out_unlock: + rtnl_unlock(); + return err; +} + +static void +nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev) +{ + rtnl_unlock(); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + /* message building helper */ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) @@ -612,6 +690,107 @@ static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev) ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32); } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +#include + +static int +ieee802154_llsec_send_key_id(struct sk_buff *msg, + const struct ieee802154_llsec_key_id *desc) +{ + struct nlattr 
*nl_dev_addr; + + if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode)) + return -ENOBUFS; + + switch (desc->mode) { + case NL802154_KEY_ID_MODE_IMPLICIT: + nl_dev_addr = nla_nest_start(msg, NL802154_KEY_ID_ATTR_IMPLICIT); + if (!nl_dev_addr) + return -ENOBUFS; + + if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID, + desc->device_addr.pan_id) || + nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE, + desc->device_addr.mode)) + return -ENOBUFS; + + switch (desc->device_addr.mode) { + case NL802154_DEV_ADDR_SHORT: + if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT, + desc->device_addr.short_addr)) + return -ENOBUFS; + break; + case NL802154_DEV_ADDR_EXTENDED: + if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, + desc->device_addr.extended_addr)) + return -ENOBUFS; + break; + default: + /* userspace should handle unknown */ + break; + } + + nla_nest_end(msg, nl_dev_addr); + break; + case NL802154_KEY_ID_MODE_INDEX: + break; + case NL802154_KEY_ID_MODE_INDEX_SHORT: + /* TODO rename short_source? */ + if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT, + desc->short_source)) + return -ENOBUFS; + break; + case NL802154_KEY_ID_MODE_INDEX_EXTENDED: + if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED, + desc->extended_source)) + return -ENOBUFS; + break; + default: + /* userspace should handle unknown */ + break; + } + + /* TODO key_id to key_idx ? 
Check naming */ + if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { + if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id)) + return -ENOBUFS; + } + + return 0; +} + +static int nl802154_get_llsec_params(struct sk_buff *msg, + struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + struct nlattr *nl_key_id; + struct ieee802154_llsec_params params; + int ret; + + ret = rdev_get_llsec_params(rdev, wpan_dev, &params); + if (ret < 0) + return ret; + + if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) || + nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) || + nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER, + params.frame_counter)) + return -ENOBUFS; + + nl_key_id = nla_nest_start(msg, NL802154_ATTR_SEC_OUT_KEY_ID); + if (!nl_key_id) + return -ENOBUFS; + + ret = ieee802154_llsec_send_key_id(msg, &params.out_key); + if (ret < 0) + return ret; + + nla_nest_end(msg, nl_key_id); + + return 0; +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + static int nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, @@ -663,6 +842,11 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq)) goto nla_put_failure; +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0) + goto nla_put_failure; +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + genlmsg_end(msg, hdr); return 0; @@ -1073,122 +1257,953 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } -#define NL802154_FLAG_NEED_WPAN_PHY 0x01 -#define NL802154_FLAG_NEED_NETDEV 0x02 -#define NL802154_FLAG_NEED_RTNL 0x04 -#define NL802154_FLAG_CHECK_NETDEV_UP 0x08 -#define NL802154_FLAG_NEED_NETDEV_UP (NL802154_FLAG_NEED_NETDEV |\ - NL802154_FLAG_CHECK_NETDEV_UP) -#define NL802154_FLAG_NEED_WPAN_DEV 0x10 
-#define NL802154_FLAG_NEED_WPAN_DEV_UP (NL802154_FLAG_NEED_WPAN_DEV |\ - NL802154_FLAG_CHECK_NETDEV_UP) +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { + [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, + [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 }, + [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 }, + [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 }, +}; -static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) +static int +ieee802154_llsec_parse_dev_addr(struct nlattr *nla, + struct ieee802154_addr *addr) { - struct cfg802154_registered_device *rdev; - struct wpan_dev *wpan_dev; - struct net_device *dev; - bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL; + struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; - if (rtnl) - rtnl_lock(); + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, + nl802154_dev_addr_policy)) + return -EINVAL; - if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) { - rdev = cfg802154_get_dev_from_info(genl_info_net(info), info); - if (IS_ERR(rdev)) { - if (rtnl) - rtnl_unlock(); - return PTR_ERR(rdev); - } - info->user_ptr[0] = rdev; - } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV || - ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { - ASSERT_RTNL(); - wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info), - info->attrs); - if (IS_ERR(wpan_dev)) { - if (rtnl) - rtnl_unlock(); - return PTR_ERR(wpan_dev); - } + if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] && + !attrs[NL802154_DEV_ADDR_ATTR_MODE] && + !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] || + attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])) + return -EINVAL; - dev = wpan_dev->netdev; - rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); + addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]); + addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]); + switch (addr->mode) { 
+ case NL802154_DEV_ADDR_SHORT: + addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]); + break; + case NL802154_DEV_ADDR_EXTENDED: + addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]); + break; + default: + return -EINVAL; + } - if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) { - if (!dev) { - if (rtnl) - rtnl_unlock(); - return -EINVAL; - } + return 0; +} - info->user_ptr[1] = dev; - } else { - info->user_ptr[1] = wpan_dev; - } +static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = { + [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 }, + [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 }, + [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED }, + [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 }, + [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 }, +}; - if (dev) { - if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP && - !netif_running(dev)) { - if (rtnl) - rtnl_unlock(); - return -ENETDOWN; - } +static int +ieee802154_llsec_parse_key_id(struct nlattr *nla, + struct ieee802154_llsec_key_id *desc) +{ + struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; - dev_hold(dev); - } + if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla, + nl802154_key_id_policy)) + return -EINVAL; - info->user_ptr[0] = rdev; + if (!attrs[NL802154_KEY_ID_ATTR_MODE]) + return -EINVAL; + + desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]); + switch (desc->mode) { + case NL802154_KEY_ID_MODE_IMPLICIT: + if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT]) + return -EINVAL; + + if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT], + &desc->device_addr) < 0) + return -EINVAL; + break; + case NL802154_KEY_ID_MODE_INDEX: + break; + case NL802154_KEY_ID_MODE_INDEX_SHORT: + if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]) + return -EINVAL; + + desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]); + break; + case 
NL802154_KEY_ID_MODE_INDEX_EXTENDED: + if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]) + return -EINVAL; + + desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]); + break; + default: + return -EINVAL; + } + + if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { + if (!attrs[NL802154_KEY_ID_ATTR_INDEX]) + return -EINVAL; + + /* TODO change id to idx */ + desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]); } return 0; } -static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) +static int nl802154_set_llsec_params(struct sk_buff *skb, + struct genl_info *info) { - if (info->user_ptr[1]) { - if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { - struct wpan_dev *wpan_dev = info->user_ptr[1]; + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_params params; + u32 changed = 0; + int ret; - if (wpan_dev->netdev) - dev_put(wpan_dev->netdev); - } else { - dev_put(info->user_ptr[1]); + if (info->attrs[NL802154_ATTR_SEC_ENABLED]) { + u8 enabled; + + enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); + if (enabled != 0 && enabled != 1) + return -EINVAL; + + params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); + changed |= IEEE802154_LLSEC_PARAM_ENABLED; + } + + if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) { + ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID], + &params.out_key); + if (ret < 0) + return ret; + + changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; + } + + if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) { + params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]); + if (params.out_level > NL802154_SECLEVEL_MAX) + return -EINVAL; + + changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; + } + + if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) { + params.frame_counter = 
nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]); + changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; + } + + return rdev_set_llsec_params(rdev, wpan_dev, &params, changed); +} + +static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_key_entry *key) +{ + void *hdr; + u32 commands[NL802154_CMD_FRAME_NR_IDS / 32]; + struct nlattr *nl_key, *nl_key_id; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY); + if (!nl_key) + goto nla_put_failure; + + nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID); + if (!nl_key_id) + goto nla_put_failure; + + if (ieee802154_llsec_send_key_id(msg, &key->id) < 0) + goto nla_put_failure; + + nla_nest_end(msg, nl_key_id); + + if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES, + key->key->frame_types)) + goto nla_put_failure; + + if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) { + /* TODO for each nested */ + memset(commands, 0, sizeof(commands)); + commands[7] = key->key->cmd_frame_ids; + if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS, + sizeof(commands), commands)) + goto nla_put_failure; + } + + if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE, + key->key->key)) + goto nla_put_failure; + + nla_nest_end(msg, nl_key); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_key_entry *key; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + 
err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(key, &table->keys, list) { + if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, key) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; } } - if (ops->internal_flags & NL802154_FLAG_NEED_RTNL) - rtnl_unlock(); + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; } -static const struct genl_ops nl802154_ops[] = { - { - .cmd = NL802154_CMD_GET_WPAN_PHY, - .doit = nl802154_get_wpan_phy, - .dumpit = nl802154_dump_wpan_phy, - .done = nl802154_dump_wpan_phy_done, - .policy = nl802154_policy, - /* can be retrieved by unprivileged users */ - .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | - NL802154_FLAG_NEED_RTNL, - }, - { - .cmd = NL802154_CMD_GET_INTERFACE, - .doit = nl802154_get_interface, - .dumpit = nl802154_dump_interface, - .policy = nl802154_policy, - /* can be retrieved by unprivileged users */ - .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | - NL802154_FLAG_NEED_RTNL, - }, - { - .cmd = NL802154_CMD_NEW_INTERFACE, - .doit = nl802154_new_interface, - .policy = nl802154_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | +static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = { + [NL802154_KEY_ATTR_ID] = { NLA_NESTED }, + /* TODO handle it as for_each_nested and NLA_FLAG? */ + [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 }, + /* TODO handle it as for_each_nested, not static array? 
*/ + [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 }, + [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE }, +}; + +static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; + struct ieee802154_llsec_key key = { }; + struct ieee802154_llsec_key_id id = { }; + u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; + + if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_KEY], + nl802154_key_policy)) + return -EINVAL; + + if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || + !attrs[NL802154_KEY_ATTR_BYTES]) + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]); + if (key.frame_types > BIT(NL802154_FRAME_MAX) || + ((key.frame_types & BIT(NL802154_FRAME_CMD)) && + !attrs[NL802154_KEY_ATTR_USAGE_CMDS])) + return -EINVAL; + + if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) { + /* TODO for each nested */ + nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS], + NL802154_CMD_FRAME_NR_IDS / 8); + + /* TODO understand the -EINVAL logic here? 
last condition */ + if (commands[0] || commands[1] || commands[2] || commands[3] || + commands[4] || commands[5] || commands[6] || + commands[7] > BIT(NL802154_CMD_FRAME_MAX)) + return -EINVAL; + + key.cmd_frame_ids = commands[7]; + } else { + key.cmd_frame_ids = 0; + } + + nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE); + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + return rdev_add_llsec_key(rdev, wpan_dev, &id, &key); +} + +static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; + struct ieee802154_llsec_key_id id; + + if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_KEY], + nl802154_key_policy)) + return -EINVAL; + + if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) + return -ENOBUFS; + + return rdev_del_llsec_key(rdev, wpan_dev, &id); +} + +static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_device *dev_desc) +{ + void *hdr; + struct nlattr *nl_device; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_device = nla_nest_start(msg, NL802154_ATTR_SEC_DEVICE); + if (!nl_device) + goto nla_put_failure; + + if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER, + dev_desc->frame_counter) || + nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) || + nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR, + dev_desc->short_addr) || + nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR, + dev_desc->hwaddr) || + nla_put_u8(msg, 
NL802154_DEV_ATTR_SECLEVEL_EXEMPT, + dev_desc->seclevel_exempt) || + nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode)) + goto nla_put_failure; + + nla_nest_end(msg, nl_device); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_device *dev; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(dev, &table->devices, list) { + if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, dev) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = { + [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 }, + [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 }, + [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 }, + [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, + [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 }, + [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 }, +}; + +static int +ieee802154_llsec_parse_device(struct nlattr *nla, + struct ieee802154_llsec_device *dev) +{ + struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, nla, + 
nl802154_dev_policy)) + return -EINVAL; + + memset(dev, 0, sizeof(*dev)); + + if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] || + !attrs[NL802154_DEV_ATTR_PAN_ID] || + !attrs[NL802154_DEV_ATTR_SHORT_ADDR] || + !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] || + !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] || + !attrs[NL802154_DEV_ATTR_KEY_MODE]) + return -EINVAL; + + /* TODO be32 */ + dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]); + dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]); + dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]); + /* TODO rename hwaddr to extended_addr */ + dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); + dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]); + dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]); + + if (dev->key_mode > NL802154_DEVKEY_MAX || + (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1)) + return -EINVAL; + + return 0; +} + +static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_device dev_desc; + + if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE], + &dev_desc) < 0) + return -EINVAL; + + return rdev_add_device(rdev, wpan_dev, &dev_desc); +} + +static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; + __le64 extended_addr; + + if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVICE], + nl802154_dev_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + extended_addr = 
nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); + return rdev_del_device(rdev, wpan_dev, extended_addr); +} + +static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + void *hdr; + struct nlattr *nl_devkey, *nl_key_id; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_devkey = nla_nest_start(msg, NL802154_ATTR_SEC_DEVKEY); + if (!nl_devkey) + goto nla_put_failure; + + if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR, + extended_addr) || + nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER, + devkey->frame_counter)) + goto nla_put_failure; + + nl_key_id = nla_nest_start(msg, NL802154_DEVKEY_ATTR_ID); + if (!nl_key_id) + goto nla_put_failure; + + if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0) + goto nla_put_failure; + + nla_nest_end(msg, nl_key_id); + nla_nest_end(msg, nl_devkey); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_device_key *kpos; + struct ieee802154_llsec_device *dpos; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station dump */ + if (cb->args[2]) + goto out; + + /* TODO look if remove devkey and do some nested attribute */ + list_for_each_entry(dpos, &table->devices, list) { + 
list_for_each_entry(kpos, &dpos->keys, list) { + if (nl802154_send_devkey(skb, + NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, rdev, + wpan_dev->netdev, + dpos->hwaddr, + kpos) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = { + [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 }, + [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 }, + [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED }, +}; + +static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; + struct ieee802154_llsec_device_key key; + __le64 extended_addr; + + if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || + nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVKEY], + nl802154_devkey_policy) < 0) + return -EINVAL; + + if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || + !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + /* TODO change key.id ? 
*/ + if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], + &key.key_id) < 0) + return -ENOBUFS; + + /* TODO be32 */ + key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]); + /* TODO change naming hwaddr -> extended_addr + * check unique identifier short+pan OR extended_addr + */ + extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); + return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key); +} + +static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; + struct ieee802154_llsec_device_key key; + __le64 extended_addr; + + if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, + info->attrs[NL802154_ATTR_SEC_DEVKEY], + nl802154_devkey_policy)) + return -EINVAL; + + if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + /* TODO change key.id ? 
*/ + if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], + &key.key_id) < 0) + return -ENOBUFS; + + /* TODO change naming hwaddr -> extended_addr + * check unique identifier short+pan OR extended_addr + */ + extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); + return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key); +} + +static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct net_device *dev, + const struct ieee802154_llsec_seclevel *sl) +{ + void *hdr; + struct nlattr *nl_seclevel; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -1; + + if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + nl_seclevel = nla_nest_start(msg, NL802154_ATTR_SEC_LEVEL); + if (!nl_seclevel) + goto nla_put_failure; + + if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) || + nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) || + nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE, + sl->device_override)) + goto nla_put_failure; + + if (sl->frame_type == NL802154_FRAME_CMD) { + if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME, + sl->cmd_frame_id)) + goto nla_put_failure; + } + + nla_nest_end(msg, nl_seclevel); + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev = NULL; + struct ieee802154_llsec_seclevel *sl; + struct ieee802154_llsec_table *table; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + if (!wpan_dev->netdev) { + err = -EINVAL; + goto out_err; + } + + rdev_lock_llsec_table(rdev, wpan_dev); + rdev_get_llsec_table(rdev, wpan_dev, &table); + + /* TODO make it like station 
dump */ + if (cb->args[2]) + goto out; + + list_for_each_entry(sl, &table->security_levels, list) { + if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wpan_dev->netdev, sl) < 0) { + /* TODO */ + err = -EIO; + rdev_unlock_llsec_table(rdev, wpan_dev); + goto out_err; + } + } + + cb->args[2] = 1; + +out: + rdev_unlock_llsec_table(rdev, wpan_dev); + err = skb->len; +out_err: + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + +static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = { + [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 }, + [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 }, + [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 }, + [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 }, +}; + +static int +llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) +{ + struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; + + if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, + nl802154_seclevel_policy)) + return -EINVAL; + + memset(sl, 0, sizeof(*sl)); + + if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] || + !attrs[NL802154_SECLEVEL_ATTR_FRAME] || + !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]) + return -EINVAL; + + sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]); + sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]); + sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]); + if (sl->frame_type > NL802154_FRAME_MAX || + (sl->device_override != 0 && sl->device_override != 1)) + return -EINVAL; + + if (sl->frame_type == NL802154_FRAME_CMD) { + if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]) + return -EINVAL; + + sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]); + if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX) + return -EINVAL; + } + + return 0; +} + +static int nl802154_add_llsec_seclevel(struct sk_buff *skb, + 
struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_seclevel sl; + + if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], + &sl) < 0) + return -EINVAL; + + return rdev_add_seclevel(rdev, wpan_dev, &sl); +} + +static int nl802154_del_llsec_seclevel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct ieee802154_llsec_seclevel sl; + + if (!info->attrs[NL802154_ATTR_SEC_LEVEL] || + llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], + &sl) < 0) + return -EINVAL; + + return rdev_del_seclevel(rdev, wpan_dev, &sl); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + +#define NL802154_FLAG_NEED_WPAN_PHY 0x01 +#define NL802154_FLAG_NEED_NETDEV 0x02 +#define NL802154_FLAG_NEED_RTNL 0x04 +#define NL802154_FLAG_CHECK_NETDEV_UP 0x08 +#define NL802154_FLAG_NEED_NETDEV_UP (NL802154_FLAG_NEED_NETDEV |\ + NL802154_FLAG_CHECK_NETDEV_UP) +#define NL802154_FLAG_NEED_WPAN_DEV 0x10 +#define NL802154_FLAG_NEED_WPAN_DEV_UP (NL802154_FLAG_NEED_WPAN_DEV |\ + NL802154_FLAG_CHECK_NETDEV_UP) + +static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg802154_registered_device *rdev; + struct wpan_dev *wpan_dev; + struct net_device *dev; + bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL; + + if (rtnl) + rtnl_lock(); + + if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) { + rdev = cfg802154_get_dev_from_info(genl_info_net(info), info); + if (IS_ERR(rdev)) { + if (rtnl) + rtnl_unlock(); + return PTR_ERR(rdev); + } + info->user_ptr[0] = rdev; + } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV || + ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { + ASSERT_RTNL(); 
+ wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info), + info->attrs); + if (IS_ERR(wpan_dev)) { + if (rtnl) + rtnl_unlock(); + return PTR_ERR(wpan_dev); + } + + dev = wpan_dev->netdev; + rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); + + if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) { + if (!dev) { + if (rtnl) + rtnl_unlock(); + return -EINVAL; + } + + info->user_ptr[1] = dev; + } else { + info->user_ptr[1] = wpan_dev; + } + + if (dev) { + if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP && + !netif_running(dev)) { + if (rtnl) + rtnl_unlock(); + return -ENETDOWN; + } + + dev_hold(dev); + } + + info->user_ptr[0] = rdev; + } + + return 0; +} + +static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + if (info->user_ptr[1]) { + if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { + struct wpan_dev *wpan_dev = info->user_ptr[1]; + + if (wpan_dev->netdev) + dev_put(wpan_dev->netdev); + } else { + dev_put(info->user_ptr[1]); + } + } + + if (ops->internal_flags & NL802154_FLAG_NEED_RTNL) + rtnl_unlock(); +} + +static const struct genl_ops nl802154_ops[] = { + { + .cmd = NL802154_CMD_GET_WPAN_PHY, + .doit = nl802154_get_wpan_phy, + .dumpit = nl802154_dump_wpan_phy, + .done = nl802154_dump_wpan_phy_done, + .policy = nl802154_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_INTERFACE, + .doit = nl802154_get_interface, + .dumpit = nl802154_dump_interface, + .policy = nl802154_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_INTERFACE, + .doit = nl802154_new_interface, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { @@ -1287,6 +2302,119 @@ static const struct genl_ops 
nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + { + .cmd = NL802154_CMD_SET_SEC_PARAMS, + .doit = nl802154_set_llsec_params, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_SEC_KEY, + /* TODO .doit by matching key id? */ + .dumpit = nl802154_dump_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_KEY, + .doit = nl802154_add_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_KEY, + .doit = nl802154_del_llsec_key, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + /* TODO unique identifier must short+pan OR extended_addr */ + { + .cmd = NL802154_CMD_GET_SEC_DEV, + /* TODO .doit by matching extended_addr? */ + .dumpit = nl802154_dump_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_DEV, + .doit = nl802154_add_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_DEV, + .doit = nl802154_del_llsec_dev, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + /* TODO remove complete devkey, put it as nested? */ + { + .cmd = NL802154_CMD_GET_SEC_DEVKEY, + /* TODO doit by matching ??? 
*/ + .dumpit = nl802154_dump_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_DEVKEY, + .doit = nl802154_add_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_DEVKEY, + .doit = nl802154_del_llsec_devkey, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_GET_SEC_LEVEL, + /* TODO .doit by matching frame_type? */ + .dumpit = nl802154_dump_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_NEW_SEC_LEVEL, + .doit = nl802154_add_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DEL_SEC_LEVEL, + /* TODO match frame_type only? 
*/ + .doit = nl802154_del_llsec_seclevel, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; /* initialisation/exit functions */ diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 03b3575..4441c63 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -208,4 +208,113 @@ rdev_set_ackreq_default(struct cfg802154_registered_device *rdev, return ret; } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +/* TODO this is already a nl802154, so move into ieee802154 */ +static inline void +rdev_get_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_table **table) +{ + rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table); +} + +static inline void +rdev_lock_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev); +} + +static inline void +rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev) +{ + rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev); +} + +static inline int +rdev_get_llsec_params(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_params *params) +{ + return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params); +} + +static inline int +rdev_set_llsec_params(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_params *params, + u32 changed) +{ + return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params, + changed); +} + +static inline int +rdev_add_llsec_key(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + return rdev->ops->add_llsec_key(&rdev->wpan_phy, 
wpan_dev, id, key); +} + +static inline int +rdev_del_llsec_key(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id) +{ + return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id); +} + +static inline int +rdev_add_seclevel(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl); +} + +static inline int +rdev_del_seclevel(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl); +} + +static inline int +rdev_add_device(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_device *dev_desc) +{ + return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc); +} + +static inline int +rdev_del_device(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr) +{ + return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr); +} + +static inline int +rdev_add_devkey(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, + devkey); +} + +static inline int +rdev_del_devkey(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, __le64 extended_addr, + const struct ieee802154_llsec_device_key *devkey) +{ + return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, + devkey); +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + #endif /* __CFG802154_RDEV_OPS */ diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index c865ebb..57b5e94 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -266,6 +266,195 @@ ieee802154_set_ackreq_default(struct wpan_phy 
*wpan_phy, return 0; } +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL +static void +ieee802154_get_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_table **table) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + *table = &sdata->sec.table; +} + +static void +ieee802154_lock_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + mutex_lock(&sdata->sec_mtx); +} + +static void +ieee802154_unlock_llsec_table(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + + mutex_unlock(&sdata->sec_mtx); +} + +static int +ieee802154_set_llsec_params(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_params *params, + int changed) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_set_params(&sdata->sec, params, changed); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_get_llsec_params(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_llsec_params *params) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_get_params(&sdata->sec, params); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id, + const struct ieee802154_llsec_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data 
*sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_key_add(&sdata->sec, id, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_key_id *id) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_key_del(&sdata->sec, id); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_seclevel_add(&sdata->sec, sl); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_seclevel *sl) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_seclevel_del(&sdata->sec, sl); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + const struct ieee802154_llsec_device *dev_desc) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_dev_add(&sdata->sec, dev_desc); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr) +{ + struct net_device *dev = wpan_dev->netdev; + struct 
ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_dev_del(&sdata->sec, extended_addr); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_add_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_devkey_add(&sdata->sec, extended_addr, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} + +static int +ieee802154_del_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le64 extended_addr, + const struct ieee802154_llsec_device_key *key) +{ + struct net_device *dev = wpan_dev->netdev; + struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); + int res; + + mutex_lock(&sdata->sec_mtx); + res = mac802154_llsec_devkey_del(&sdata->sec, extended_addr, key); + mutex_unlock(&sdata->sec_mtx); + + return res; +} +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + const struct cfg802154_ops mac802154_config_ops = { .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated, .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated, @@ -284,4 +473,20 @@ const struct cfg802154_ops mac802154_config_ops = { .set_max_frame_retries = ieee802154_set_max_frame_retries, .set_lbt_mode = ieee802154_set_lbt_mode, .set_ackreq_default = ieee802154_set_ackreq_default, +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + .get_llsec_table = ieee802154_get_llsec_table, + .lock_llsec_table = ieee802154_lock_llsec_table, + .unlock_llsec_table = ieee802154_unlock_llsec_table, + /* TODO above */ + .set_llsec_params = ieee802154_set_llsec_params, + .get_llsec_params = ieee802154_get_llsec_params, + .add_llsec_key = ieee802154_add_llsec_key, + .del_llsec_key = ieee802154_del_llsec_key, + 
.add_seclevel = ieee802154_add_seclevel, + .del_seclevel = ieee802154_del_seclevel, + .add_device = ieee802154_add_device, + .del_device = ieee802154_del_device, + .add_devkey = ieee802154_add_devkey, + .del_devkey = ieee802154_del_devkey, +#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; -- cgit v1.1 From d58a2fa903c18f97aac30cd3c4c8a378a2c647c4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Sep 2015 09:00:26 +0200 Subject: mac802154: add comments for llsec issues While doing a little test with the llsec implementation I saw these issues. We should move decryption and encryption somewhere else, otherwise while capturing with wireshark the mac header shows security fields but the payload is plaintext. A completely different issue is what to do with HardMAC drivers where the payload is always plaintext. I think we need a special handling then in userspace. We currently don't support any HardMAC transceivers, so we should fix the first issue for SoftMAC transceivers. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 4 ++++ net/mac802154/tx.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index d1c33c1..42e9672 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -87,6 +87,10 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, skb->dev = sdata->dev; + /* TODO this should be moved after netif_receive_skb call, otherwise + * wireshark will show a mac header with security fields and the + * payload is already decrypted. 
+ */ rc = mac802154_llsec_decrypt(&sdata->sec, skb); if (rc) { pr_debug("decryption failed: %i\n", rc); diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 5ee596e..b205bbe 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -129,6 +129,10 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int rc; + /* TODO we should move it to wpan_dev_hard_header and dev_hard_header + * functions. The reason is wireshark will show a mac header which is + * with security fields but the payload is not encrypted. + */ rc = mac802154_llsec_encrypt(&sdata->sec, skb); if (rc) { netdev_warn(dev, "encryption failed: %i\n", rc); -- cgit v1.1 From b40988c438c2405a177ae54ff4baa08c720c296f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Sep 2015 12:36:26 +0200 Subject: ieee802154: change mtu size behaviour This patch changes the mtu size of 802.15.4 interfaces. The current setting is the meaning of the maximum transport unit with mac header, which is 127 bytes according 802.15.4. The linux meaning of the mtu size field is the maximum payload of a mac frame. Like in ethernet, which is 1500 bytes. We have dynamic length of mac frames in 802.15.4, this is why we assume the minimum header length which is hard_header_len. This contains fc and sequence fields. These can evaluated by driver layer without additional checks. We currently don't support to set the FCS from userspace, so we need to subtract this from mtu size as well. 
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/socket.c | 4 ++-- net/mac802154/iface.c | 12 +++++++++++- net/mac802154/tx.c | 11 ----------- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index be77f21..a548be2 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -273,7 +273,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - mtu = dev->mtu; + mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { @@ -637,7 +637,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = -ENXIO; goto out; } - mtu = dev->mtu; + mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 3954bcf..7079cd3 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -547,7 +547,17 @@ static void ieee802154_if_setup(struct net_device *dev) */ dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + IEEE802154_FCS_LEN; - dev->mtu = IEEE802154_MTU; + /* The mtu size is the payload without mac header in this case. + * We have a dynamic length header with a minimum header length + * which is hard_header_len. In this case we let mtu to the size + * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - + * hard_header_len. The FCS which is set by hardware or ndo_start_xmit + * and the minimum mac header which can be evaluated inside driver + * layer. The rest of mac header will be part of payload if greater + * than hard_header_len. 
+ */ + dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - + dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index b205bbe..3827f35 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -71,17 +71,6 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) struct net_device *dev = skb->dev; int ret; - /* This check is for AF_PACKET RAW socket only, which doesn't - * know about the FCS which is set here or by hardware. otherwise - * it should not occur in any case! - * - * TODO: This should be handled in AF_PACKET and return -EMSGSIZE. - */ - if (skb->len > IEEE802154_MTU - IEEE802154_FCS_LEN) { - netdev_warn(dev, "Frame len above MTU limit. Dropped.\n"); - goto err_tx; - } - if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) { u16 crc = crc_ccitt(0, skb->data, skb->len); -- cgit v1.1 From 72d53b116264d5e570f610b3971dae4721aa5c0f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 30 Sep 2015 10:20:09 +0200 Subject: ieee802154: 6lowpan: change datagram var types This patch changes the datagram size variables from type u16 to unsigned int. The reason is that an IPv6 header can carry a payload length of up to MAX_UINT16, but the datagram size is payload + IPv6 header length. This avoids overflows in some places.
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/6lowpan_i.h | 4 ++-- net/ieee802154/6lowpan/reassembly.c | 2 +- net/ieee802154/6lowpan/tx.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index b4e17a7..10d44d0 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -18,7 +18,7 @@ typedef unsigned __bitwise__ lowpan_rx_result; struct lowpan_create_arg { u16 tag; - u16 d_size; + unsigned int d_size; const struct ieee802154_addr *src; const struct ieee802154_addr *dst; }; @@ -29,7 +29,7 @@ struct lowpan_frag_queue { struct inet_frag_queue q; u16 tag; - u16 d_size; + unsigned int d_size; struct ieee802154_addr saddr; struct ieee802154_addr daddr; }; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 12e8cf4..af663cb 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,7 +37,7 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, +static unsigned int lowpan_hash_frag(u16 tag, unsigned int d_size, const struct ieee802154_addr *saddr, const struct ieee802154_addr *daddr) { diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 7e0563e..5ecf8af 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -131,8 +131,8 @@ lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, static int lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, - const struct ieee802154_hdr *wpan_hdr, u16 dgram_size, - u16 dgram_offset) + const struct ieee802154_hdr *wpan_hdr, + unsigned int dgram_size, unsigned int dgram_offset) { __be16 frag_tag; u8 frag_hdr[5]; @@ -194,7 +194,7 @@ err: 
} static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, - u16 *dgram_size, u16 *dgram_offset) + unsigned int *dgram_size, unsigned int *dgram_offset) { struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr; struct ieee802154_addr sa, da; @@ -244,7 +244,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) { struct ieee802154_hdr wpan_hdr; int max_single, ret; - u16 dgram_size, dgram_offset; + unsigned int dgram_size, dgram_offset; pr_debug("package xmit\n"); -- cgit v1.1 From 4bc8fbc95e0d831e5e3800ecc8a8d5acac79c9a8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 30 Sep 2015 10:20:10 +0200 Subject: ieee802154: 6lowpan: don't skip first dsn while fragmentation This patch fixes the data frame sequence number (dsn) during 6lowpan fragmentation for frag1. Currently we create one 802.15.4 header at first, then check if it fits into one frame and at the end construct many fragments, calling wpan_dev_hard_header for each of them, including the first fragment. This turns the first generated header into garbage; instead, we now copy this header for frag1 rather than generating a new one, which would skip one dsn.
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/tx.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 5ecf8af..3b665e1 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -79,7 +79,7 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, static struct sk_buff* lowpan_alloc_frag(struct sk_buff *skb, int size, - const struct ieee802154_hdr *master_hdr) + const struct ieee802154_hdr *master_hdr, bool frag1) { struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev; struct sk_buff *frag; @@ -95,11 +95,17 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, skb_reset_network_header(frag); *mac_cb(frag) = *mac_cb(skb); - rc = wpan_dev_hard_header(frag, wdev, &master_hdr->dest, - &master_hdr->source, size); - if (rc < 0) { - kfree_skb(frag); - return ERR_PTR(rc); + if (frag1) { + memcpy(skb_put(frag, skb->mac_len), + skb_mac_header(skb), skb->mac_len); + } else { + rc = wpan_dev_hard_header(frag, wdev, + &master_hdr->dest, + &master_hdr->source, size); + if (rc < 0) { + kfree_skb(frag); + return ERR_PTR(rc); + } } } else { frag = ERR_PTR(-ENOMEM); @@ -111,13 +117,13 @@ lowpan_alloc_frag(struct sk_buff *skb, int size, static int lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, u8 *frag_hdr, int frag_hdrlen, - int offset, int len) + int offset, int len, bool frag1) { struct sk_buff *frag; raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen); - frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr); + frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr, frag1); if (IS_ERR(frag)) return PTR_ERR(frag); @@ -156,7 +162,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, LOWPAN_FRAG1_HEAD_SIZE, 0, - frag_len + 
skb_network_header_len(skb)); + frag_len + skb_network_header_len(skb), + true); if (rc) { pr_debug("%s unable to send FRAG1 packet (tag: %d)", __func__, ntohs(frag_tag)); @@ -177,7 +184,7 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr, LOWPAN_FRAGN_HEAD_SIZE, skb_offset, - frag_len); + frag_len, false); if (rc) { pr_debug("%s unable to send a FRAGN packet. (tag: %d, offset: %d)\n", __func__, ntohs(frag_tag), skb_offset); -- cgit v1.1 From 1c64f147d3cc9bbafe091a7b335ea3ec700186f0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 30 Sep 2015 10:20:11 +0200 Subject: ieee802154: 6lowpan: add tx/rx stats This patch adds support for increment transmit and receive stats. The meaning of these stats are IPv6 based, which shows the stats after running the 6lowpan adaptation layer (uncompression/compression, fragmentation handling) on receive and before the adaptation layer when transmit. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/rx.c | 2 ++ net/ieee802154/6lowpan/tx.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index b1fd47d..65d55e0 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -29,6 +29,8 @@ static int lowpan_give_skb_to_device(struct sk_buff *skb) { skb->protocol = htons(ETH_P_IPV6); + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; return netif_rx(skb); } diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 3b665e1..5736302 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -192,6 +192,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, } } while (skb_unprocessed > frag_cap); + ldev->stats.tx_packets++; + ldev->stats.tx_bytes += dgram_size; consume_skb(skb); return NET_XMIT_SUCCESS; @@ -277,6 +279,8 @@ netdev_tx_t 
lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) { skb->dev = lowpan_dev_info(ldev)->wdev; + ldev->stats.tx_packets++; + ldev->stats.tx_bytes += dgram_size; return dev_queue_xmit(skb); } else { netdev_tx_t rc; -- cgit v1.1 From 59fe4606748f7016e9e02bbd26da185f8620661a Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 3 Feb 2012 11:09:23 -0500 Subject: RDS: use kfree_rcu in rds_ib_remove_ipaddr synchronize_rcu() unnecessarily slows down the socket shutdown path. It is only used to kfree() the IP addresses in rds_ib_remove_ipaddr(), which is a perfect use case for kfree_rcu(). So let's use that to gain some speedup. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 1 + net/rds/ib_rdma.c | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index aae60fd..f1fd5ffec 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -164,6 +164,7 @@ struct rds_ib_connection { struct rds_ib_ipaddr { struct list_head list; __be32 ipaddr; + struct rcu_head rcu; }; struct rds_ib_device { diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 251d1ce..872f523 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -159,10 +159,8 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) } spin_unlock_irq(&rds_ibdev->spinlock); - if (to_free) { - synchronize_rcu(); - kfree(to_free); - } + if (to_free) + kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) -- cgit v1.1 From 8b0a6b461e2ccc95363e0547aa4f43ba2e02b096 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 3 Feb 2012 11:09:23 -0500 Subject: RDS: make socket bind/release locking scheme simple and more efficient RDS bind and release locking scheme is very inefficient.
It uses RCU for maintaining the bind hash-table which is great but it also needs to hold spinlock for [add/remove]_bound(). So overall usecase, the hash-table concurrent speedup doesn't pay off. In fact blocking nature of synchronize_rcu() makes the RDS socket shutdown too slow which hurts RDS performance since connection shutdown and re-connect happens quite often to maintain the RC part of the protocol. So we make the locking scheme simpler and more efficient by replacing spin_locks with reader/writer locks and getting rid off rcu for bind hash-table. In subsequent patch, we also covert the global lock with per-bucket lock to reduce the global lock contention. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/af_rds.c | 6 ------ net/rds/bind.c | 35 +++++++++++++++-------------------- 2 files changed, 15 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index a2f28a6..dc08766 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -72,13 +72,7 @@ static int rds_release(struct socket *sock) rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); - /* - * the binding lookup hash uses rcu, we need to - * make sure we synchronize_rcu before we free our - * entry - */ rds_remove_bound(rs); - synchronize_rcu(); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); diff --git a/net/rds/bind.c b/net/rds/bind.c index dd666fb..01989e2 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -40,7 +40,7 @@ #define BIND_HASH_SIZE 1024 static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_SPINLOCK(rds_bind_lock); +static DEFINE_RWLOCK(rds_bind_lock); static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) { @@ -48,6 +48,7 @@ static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) (BIND_HASH_SIZE - 1)); } +/* must hold either read or write lock (write lock for insert != NULL) */ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *insert) { 
@@ -56,30 +57,24 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - rcu_read_lock(); - hlist_for_each_entry_rcu(rs, head, rs_bound_node) { + hlist_for_each_entry(rs, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (cmp == needle) { - rcu_read_unlock(); + if (cmp == needle) return rs; - } } - rcu_read_unlock(); if (insert) { /* * make sure our addr and port are set before - * we are added to the list, other people - * in rcu will find us as soon as the - * hlist_add_head_rcu is done + * we are added to the list. */ insert->rs_bound_addr = addr; insert->rs_bound_port = port; rds_sock_addref(insert); - hlist_add_head_rcu(&insert->rs_bound_node, head); + hlist_add_head(&insert->rs_bound_node, head); } return NULL; } @@ -93,8 +88,11 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; + unsigned long flags; + read_lock_irqsave(&rds_bind_lock, flags); rs = rds_bind_lookup(addr, port, NULL); + read_unlock_irqrestore(&rds_bind_lock, flags); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); @@ -103,6 +101,7 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); + return rs; } @@ -121,7 +120,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); do { if (rover == 0) @@ -135,7 +134,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) } } while (rover++ != last); - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); return ret; } @@ -144,19 +143,19 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; - 
spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - hlist_del_init_rcu(&rs->rs_bound_node); + hlist_del_init(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -200,9 +199,5 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); - - /* we might have called rds_remove_bound on error */ - if (ret) - synchronize_rcu(); return ret; } -- cgit v1.1 From 28126959882d3ec4745f2ec800f3a1d74368b2fe Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 3 Feb 2012 11:08:50 -0500 Subject: RDS: fix rds_sock reference bug while doing bind One need to take rds socket reference while using it and release it once done with it. rds_add_bind() code path does not do that so lets fix it. 
Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/bind.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index 01989e2..166c605 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -61,8 +61,10 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (cmp == needle) + if (cmp == needle) { + rds_sock_addref(rs); return rs; + } } if (insert) { @@ -94,10 +96,10 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) rs = rds_bind_lookup(addr, port, NULL); read_unlock_irqrestore(&rds_bind_lock, flags); - if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) - rds_sock_addref(rs); - else + if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { + rds_sock_put(rs); rs = NULL; + } rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -123,14 +125,18 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) write_lock_irqsave(&rds_bind_lock, flags); do { + struct rds_sock *rrs; if (rover == 0) rover++; - if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + rrs = rds_bind_lookup(addr, cpu_to_be16(rover), rs); + if (!rrs) { *port = rs->rs_bound_port; ret = 0; rdsdebug("rs %p binding to %pI4:%d\n", rs, &addr, (int)ntohs(*port)); break; + } else { + rds_sock_put(rrs); } } while (rover++ != last); -- cgit v1.1 From 9b9acde7e887e057568cd077d9c3377d2cb9aa5b Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 11 Feb 2014 19:34:25 -0800 Subject: RDS: Use per-bucket rw lock for bind hash-table One global lock protecting hash-tables with 1024 buckets isn't efficient and it shows up in a massive systems with truck loads of RDS sockets serving multiple databases. The perf data clearly highlights the contention on the rw lock in these massive workloads. 
When the contention gets worse, the code gets into a state where it decides to back off on the lock. So while it has disabled interrupts, it sits and backs off on this lock get. This causes the system to become sluggish and eventually all sorts of bad things happen. The simple fix is to move the lock into the hash bucket and use per-bucket lock to improve the scalability. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/af_rds.c | 2 ++ net/rds/bind.c | 47 ++++++++++++++++++++++++++++++++--------------- net/rds/rds.h | 1 + 3 files changed, 35 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index dc08766..384ea1e 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -582,6 +582,8 @@ static int rds_init(void) { int ret; + rds_bind_lock_init(); + ret = rds_conn_init(); if (ret) goto out; diff --git a/net/rds/bind.c b/net/rds/bind.c index 166c605..bc6b93e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -38,22 +38,27 @@ #include #include "rds.h" +struct bind_bucket { + rwlock_t lock; + struct hlist_head head; +}; + #define BIND_HASH_SIZE 1024 -static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_RWLOCK(rds_bind_lock); +static struct bind_bucket bind_hash_table[BIND_HASH_SIZE]; -static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +static struct bind_bucket *hash_to_bucket(__be32 addr, __be16 port) { return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & (BIND_HASH_SIZE - 1)); } /* must hold either read or write lock (write lock for insert != NULL) */ -static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, +static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket, + __be32 addr, __be16 port, struct rds_sock *insert) { struct rds_sock *rs; - struct hlist_head *head = hash_to_bucket(addr, port); + struct hlist_head *head = &bucket->head; u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); @@ -91,10 
+96,11 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; unsigned long flags; + struct bind_bucket *bucket = hash_to_bucket(addr, port); - read_lock_irqsave(&rds_bind_lock, flags); - rs = rds_bind_lookup(addr, port, NULL); - read_unlock_irqrestore(&rds_bind_lock, flags); + read_lock_irqsave(&bucket->lock, flags); + rs = rds_bind_lookup(bucket, addr, port, NULL); + read_unlock_irqrestore(&bucket->lock, flags); if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { rds_sock_put(rs); @@ -113,6 +119,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) unsigned long flags; int ret = -EADDRINUSE; u16 rover, last; + struct bind_bucket *bucket; if (*port != 0) { rover = be16_to_cpu(*port); @@ -122,13 +129,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - write_lock_irqsave(&rds_bind_lock, flags); - do { struct rds_sock *rrs; if (rover == 0) rover++; - rrs = rds_bind_lookup(addr, cpu_to_be16(rover), rs); + + bucket = hash_to_bucket(addr, cpu_to_be16(rover)); + write_lock_irqsave(&bucket->lock, flags); + rrs = rds_bind_lookup(bucket, addr, cpu_to_be16(rover), rs); + write_unlock_irqrestore(&bucket->lock, flags); if (!rrs) { *port = rs->rs_bound_port; ret = 0; @@ -140,16 +149,16 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) } } while (rover++ != last); - write_unlock_irqrestore(&rds_bind_lock, flags); - return ret; } void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; + struct bind_bucket *bucket = + hash_to_bucket(rs->rs_bound_addr, rs->rs_bound_port); - write_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&bucket->lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", @@ -161,7 +170,7 @@ void rds_remove_bound(struct rds_sock *rs) rs->rs_bound_addr = 0; } - write_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&bucket->lock, flags); } int rds_bind(struct socket *sock, 
struct sockaddr *uaddr, int addr_len) @@ -207,3 +216,11 @@ out: release_sock(sk); return ret; } + +void rds_bind_lock_init(void) +{ + int i; + + for (i = 0; i < BIND_HASH_SIZE; i++) + rwlock_init(&bind_hash_table[i].lock); +} diff --git a/net/rds/rds.h b/net/rds/rds.h index afb4048..121fb81 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -603,6 +603,7 @@ extern wait_queue_head_t rds_poll_waitq; int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +void rds_bind_lock_init(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); -- cgit v1.1 From 5f509239eccc9d118d3474a22e78b3da1ceefe02 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 1 Oct 2015 08:03:06 +0200 Subject: ieee802154: handle datagram variables as u16 This reverts commit 9abc378c66e3d6f437eed77c1c534cbc183523f7 ("ieee802154: 6lowpan: change datagram var types"). The reason is that I forgot the IPv6 fragmentation here. The MTU of a lowpan interface is 1280 and skb->len should not exceed that. If we reach a payload above 1280 in the IPv6 header then we have IPv6 fragmentation on top of 802.15.4 6LoWPAN fragmentation. The type "u16" was fine; instead I have now added a WARN_ON_ONCE if skb->len is above the MTU, which should never happen, otherwise IPv6 on minimum MTU size is broken.
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/6lowpan_i.h | 4 ++-- net/ieee802154/6lowpan/reassembly.c | 2 +- net/ieee802154/6lowpan/tx.c | 10 ++++++---- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index 10d44d0..b4e17a7 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -18,7 +18,7 @@ typedef unsigned __bitwise__ lowpan_rx_result; struct lowpan_create_arg { u16 tag; - unsigned int d_size; + u16 d_size; const struct ieee802154_addr *src; const struct ieee802154_addr *dst; }; @@ -29,7 +29,7 @@ struct lowpan_frag_queue { struct inet_frag_queue q; u16 tag; - unsigned int d_size; + u16 d_size; struct ieee802154_addr saddr; struct ieee802154_addr daddr; }; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index af663cb..12e8cf4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,7 +37,7 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, unsigned int d_size, +static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, const struct ieee802154_addr *saddr, const struct ieee802154_addr *daddr) { diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 5736302..62a21f6 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -137,8 +137,8 @@ lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, static int lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, - const struct ieee802154_hdr *wpan_hdr, - unsigned int dgram_size, unsigned int dgram_offset) + const struct ieee802154_hdr *wpan_hdr, u16 dgram_size, + u16 dgram_offset) { __be16 frag_tag; u8 frag_hdr[5]; @@ -203,7 +203,7 @@ 
err: } static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, - unsigned int *dgram_size, unsigned int *dgram_offset) + u16 *dgram_size, u16 *dgram_offset) { struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr; struct ieee802154_addr sa, da; @@ -253,10 +253,12 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) { struct ieee802154_hdr wpan_hdr; int max_single, ret; - unsigned int dgram_size, dgram_offset; + u16 dgram_size, dgram_offset; pr_debug("package xmit\n"); + WARN_ON_ONCE(skb->len > IPV6_MIN_MTU); + /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ -- cgit v1.1 From b1842ffddf8941aee4fcd95594bf62d3dc2867cc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 1 Oct 2015 11:41:42 -0500 Subject: ipv6: Add missing newline to __xfrm6_output_finish Add a newline between variable declarations and the code. Signed-off-by: "Eric W. Biederman" --- net/ipv6/xfrm6_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 335066a..4cefda0 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -134,6 +134,7 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; + return x->outer_mode->afinfo->output_finish(sk, skb); } -- cgit v1.1 From aa6555622cdf443f0b001352fdc3afb6e7bce20d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 2 Oct 2015 10:47:29 +0300 Subject: nl802154: Missing return in nl802154_add_llsec_key() There was a missing return here so it meant that often ieee802154_llsec_parse_key_id() was not called. 
Fixes: a26c5fd7622d ('nl802154: add support for security layer') Signed-off-by: Dan Carpenter Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 1e9e865..16ef0d9 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1534,6 +1534,7 @@ static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || !attrs[NL802154_KEY_ATTR_BYTES]) + return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; -- cgit v1.1 From 586c2b573ee4c2c4ba03e16318a16614ebf876f8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 2 Oct 2015 15:05:10 +0200 Subject: bridge: vlan: use rcu list for the ordered vlan list When I did the conversion to rhashtable I missed the required locking of one important user of the vlan list - br_get_link_af_size_filtered() which is called: br_ifinfo_notify() -> br_nlmsg_size() -> br_get_link_af_size_filtered() and the notifications can be sent without holding rtnl. Before this conversion the function relied on using rcu and since we already use rcu to destroy the vlans, we can simply migrate the list to use the rcu helpers. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 10 ++++++++-- net/bridge/br_vlan.c | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c64dcad..c318619 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -34,7 +34,7 @@ static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, pvid = br_get_pvid(vg); /* Count number of vlan infos */ - list_for_each_entry(v, &vg->vlan_list, vlist) { + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; /* only a context, bridge vlan not activated */ if (!br_vlan_should_use(v)) @@ -76,13 +76,19 @@ initvars: static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) { + int num_vlans; + if (!vg) return 0; if (filter_mask & RTEXT_FILTER_BRVLAN) return vg->num_vlans; - return __get_num_vlan_infos(vg, filter_mask); + rcu_read_lock(); + num_vlans = __get_num_vlan_infos(vg, filter_mask); + rcu_read_unlock(); + + return num_vlans; } static size_t br_get_link_af_size_filtered(const struct net_device *dev, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 1a79e19..d97a55e 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -111,12 +111,12 @@ static void __vlan_add_list(struct net_bridge_vlan *v) else break; } - list_add(&v->vlist, hpos); + list_add_rcu(&v->vlist, hpos); } static void __vlan_del_list(struct net_bridge_vlan *v) { - list_del(&v->vlist); + list_del_rcu(&v->vlist); } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, -- cgit v1.1 From f8ed289fab843fbc9251aa2f5c3d416f09b5fc7e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 2 Oct 2015 15:05:11 +0200 Subject: bridge: vlan: use br_vlan_(get|put)_master to deal with refcounts Introduce br_vlan_(get|put)_master which take a reference (or create the master vlan first if it didn't exist) and drop a reference respectively. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_vlan.c | 56 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index d97a55e..6e41fba 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -146,6 +146,40 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, return err; } +/* Returns a master vlan, if it didn't exist it gets created. In all cases + * a reference is taken to the master vlan before returning. + */ +static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid) +{ + struct net_bridge_vlan *masterv; + + masterv = br_vlan_find(br->vlgrp, vid); + if (!masterv) { + /* missing global ctx, create it now */ + if (br_vlan_add(br, vid, 0)) + return NULL; + masterv = br_vlan_find(br->vlgrp, vid); + if (WARN_ON(!masterv)) + return NULL; + } + atomic_inc(&masterv->refcnt); + + return masterv; +} + +static void br_vlan_put_master(struct net_bridge_vlan *masterv) +{ + if (!br_vlan_is_master(masterv)) + return; + + if (atomic_dec_and_test(&masterv->refcnt)) { + rhashtable_remove_fast(&masterv->br->vlgrp->vlan_hash, + &masterv->vnode, br_vlan_rht_params); + __vlan_del_list(masterv); + kfree_rcu(masterv, rcu); + } +} + /* This is the shared VLAN add function which works for both ports and bridge * devices. 
There are four possible calls to this function in terms of the * vlan entry type: @@ -196,16 +230,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) goto out_filt; } - masterv = br_vlan_find(br->vlgrp, v->vid); - if (!masterv) { - /* missing global ctx, create it now */ - err = br_vlan_add(br, v->vid, 0); - if (err) - goto out_filt; - masterv = br_vlan_find(br->vlgrp, v->vid); - WARN_ON(!masterv); - } - atomic_inc(&masterv->refcnt); + masterv = br_vlan_get_master(br, v->vid); + if (!masterv) + goto out_filt; v->brvlan = masterv; } @@ -240,7 +267,7 @@ out_filt: if (p) { __vlan_vid_del(dev, br, v->vid); if (masterv) { - atomic_dec(&masterv->refcnt); + br_vlan_put_master(masterv); v->brvlan = NULL; } } @@ -289,12 +316,7 @@ static int __vlan_del(struct net_bridge_vlan *v) kfree_rcu(v, rcu); } - if (atomic_dec_and_test(&masterv->refcnt)) { - rhashtable_remove_fast(&masterv->br->vlgrp->vlan_hash, - &masterv->vnode, br_vlan_rht_params); - __vlan_del_list(masterv); - kfree_rcu(masterv, rcu); - } + br_vlan_put_master(masterv); out: return err; } -- cgit v1.1 From 2ffdf508d278d48ccb928238846df352db21f4eb Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 2 Oct 2015 15:05:12 +0200 Subject: bridge: vlan: drop master_flags from __vlan_add There's only one user now and we can include the flag directly. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6e41fba..2c1fdf9 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -212,8 +212,6 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) } if (p) { - u16 master_flags = flags; - /* Add VLAN to the device filter if it is supported. * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). 
@@ -224,8 +222,8 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) /* need to work on the master vlan too */ if (flags & BRIDGE_VLAN_INFO_MASTER) { - master_flags |= BRIDGE_VLAN_INFO_BRENTRY; - err = br_vlan_add(br, v->vid, master_flags); + err = br_vlan_add(br, v->vid, flags | + BRIDGE_VLAN_INFO_BRENTRY); if (err) goto out_filt; } -- cgit v1.1 From 6be144f62f64c8a67e11b2f8b86c7bf390b87411 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 2 Oct 2015 15:05:13 +0200 Subject: bridge: vlan: use br_vlan_should_use to simplify __vlan_add/del The checks that lead to num_vlans change are always what br_vlan_should_use checks for, namely if the vlan is only a context or not and depending on that it's either not counted or counted as a real/used vlan respectively. Also give better explanation in br_vlan_should_use's comment. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 +- net/bridge/br_vlan.c | 36 ++++++++++++++---------------------- 2 files changed, 15 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4ed8308..1ff6a0f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -400,7 +400,7 @@ static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v) return v->flags & BRIDGE_VLAN_INFO_BRENTRY; } -/* check if we should use the vlan entry is usable */ +/* check if we should use the vlan entry, returns false if it's only context */ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) { if (br_vlan_is_master(v)) { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 2c1fdf9..b879111 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -195,7 +195,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; - struct rhashtable *tbl; + struct net_bridge_vlan_group *vg; struct net_device *dev; 
struct net_bridge *br; int err; @@ -203,12 +203,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (br_vlan_is_master(v)) { br = v->br; dev = br->dev; - tbl = &br->vlgrp->vlan_hash; + vg = br->vlgrp; } else { p = v->port; br = p->br; dev = p->dev; - tbl = &p->vlgrp->vlan_hash; + vg = p->vlgrp; } if (p) { @@ -234,32 +234,31 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) v->brvlan = masterv; } - /* Add the dev mac only if it's a usable vlan */ + /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { err = br_fdb_insert(br, p, dev->dev_addr, v->vid); if (err) { br_err(br, "failed insert local address into bridge forwarding table\n"); goto out_filt; } + vg->num_vlans++; } - err = rhashtable_lookup_insert_fast(tbl, &v->vnode, br_vlan_rht_params); + err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, + br_vlan_rht_params); if (err) goto out_fdb_insert; __vlan_add_list(v); __vlan_add_flags(v, flags); - if (br_vlan_is_master(v)) { - if (br_vlan_is_brentry(v)) - br->vlgrp->num_vlans++; - } else { - p->vlgrp->num_vlans++; - } out: return err; out_fdb_insert: - br_fdb_find_delete_local(br, p, br->dev->dev_addr, v->vid); + if (br_vlan_should_use(v)) { + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); + vg->num_vlans--; + } out_filt: if (p) { @@ -278,15 +277,12 @@ static int __vlan_del(struct net_bridge_vlan *v) struct net_bridge_vlan *masterv = v; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; - struct net_bridge *br; int err = 0; if (br_vlan_is_master(v)) { - br = v->br; vg = v->br->vlgrp; } else { p = v->port; - br = p->br; vg = v->port->vlgrp; masterv = v->brvlan; } @@ -298,13 +294,9 @@ static int __vlan_del(struct net_bridge_vlan *v) goto out; } - if (br_vlan_is_master(v)) { - if (br_vlan_is_brentry(v)) { - v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; - br->vlgrp->num_vlans--; - } - } else { - p->vlgrp->num_vlans--; + if (br_vlan_should_use(v)) { + v->flags &= 
~BRIDGE_VLAN_INFO_BRENTRY; + vg->num_vlans--; } if (masterv != v) { -- cgit v1.1 From 7910228b6bb35f3c8e0bc72a8d84c29616cb1b90 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:28 +0200 Subject: bridge: netlink: add group_fwd_mask support Add IFLA_BR_GROUP_FWD_MASK attribute to allow setting and retrieving the group_fwd_mask via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c318619..39b201a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -764,6 +764,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, + [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -829,6 +830,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } #endif + if (data[IFLA_BR_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_RESTRICTED) + return -EINVAL; + br->group_fwd_mask = fwd_mask; + } + return 0; } @@ -844,6 +853,7 @@ static size_t br_get_size(const struct net_device *brdev) #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ #endif + nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ 0; } @@ -856,6 +866,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; + u16 group_fwd_mask = br->group_fwd_mask; u8 vlan_enabled = br_vlan_enabled(br); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, 
forward_delay) || @@ -864,7 +875,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || - nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled)) + nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || + nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From 5127c81f84de0dd643d5840a2c7de571bc6aceb3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:29 +0200 Subject: bridge: netlink: export root id Add IFLA_BR_ROOT_ID and export br->designated_root via netlink. For this purpose add struct ifla_bridge_id that would represent struct bridge_id. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 39b201a..7a36924 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -854,6 +854,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ + nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ 0; } @@ -868,6 +869,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u16 group_fwd_mask = br->group_fwd_mask; u8 vlan_enabled = br_vlan_enabled(br); + struct ifla_bridge_id root_id; + + memset(&root_id, 0, sizeof(root_id)); + memcpy(root_id.prio, br->designated_root.prio, sizeof(root_id.prio)); + memcpy(root_id.addr, br->designated_root.addr, sizeof(root_id.addr)); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, 
IFLA_BR_HELLO_TIME, hello_time) || @@ -876,7 +882,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || - nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask)) + nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask) || + nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From 7599a2201fc71cdca16a92d350f14cce8730e03f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:30 +0200 Subject: bridge: netlink: export bridge id Add IFLA_BR_BRIDGE_ID and export br->bridge_id via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 7a36924..a63f944 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -855,6 +855,7 @@ static size_t br_get_size(const struct net_device *brdev) #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ 0; } @@ -869,11 +870,14 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u16 group_fwd_mask = br->group_fwd_mask; u8 vlan_enabled = br_vlan_enabled(br); - struct ifla_bridge_id root_id; + struct ifla_bridge_id root_id, bridge_id; + memset(&bridge_id, 0, sizeof(bridge_id)); memset(&root_id, 0, sizeof(root_id)); memcpy(root_id.prio, br->designated_root.prio, sizeof(root_id.prio)); memcpy(root_id.addr, br->designated_root.addr, sizeof(root_id.addr)); + memcpy(bridge_id.prio, br->bridge_id.prio, 
sizeof(bridge_id.prio)); + memcpy(bridge_id.addr, br->bridge_id.addr, sizeof(bridge_id.addr)); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || @@ -883,7 +887,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask) || - nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id)) + nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id) || + nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(bridge_id), &bridge_id)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From 8762ba680fe8d41b444fc92f90ce7194b2b8303b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:31 +0200 Subject: bridge: netlink: export root port Add IFLA_BR_ROOT_PORT and export it via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index a63f944..652db1c 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -856,6 +856,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ 0; } @@ -888,7 +889,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask) || nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id) || - nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(bridge_id), &bridge_id)) + nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(bridge_id), &bridge_id) || + nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From 684dd248bee8c73eadb90706123bf1494d3218b8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:32 +0200 Subject: bridge: netlink: export root path cost Add IFLA_BR_ROOT_PATH_COST and export it via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 652db1c..cd0488b 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -857,6 +857,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ 0; } @@ -890,7 +891,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask) || nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id) || nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(bridge_id), &bridge_id) || - nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port)) + nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || + nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From ed4163098e3090bb7b51421bde977e355275a554 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:33 +0200 Subject: bridge: netlink: export topology_change and topology_change_detected Add IFLA_BR_TOPOLOGY_CHANGE and IFLA_BR_TOPOLOGY_CHANGE_DETECTED and export them via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index cd0488b..8bcaa51 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -858,6 +858,8 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ 0; } @@ -892,7 +894,10 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id) || nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(bridge_id), &bridge_id) || nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || - nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost)) + nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || + nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || + nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + br->topology_change_detected)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From d76bd14e0f759040efc8ce142dd6d1f9eca33d39 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:34 +0200 Subject: bridge: netlink: export all timers Export the following bridge timers (also exported via sysfs): IFLA_BR_HELLO_TIMER, IFLA_BR_TCN_TIMER, IFLA_BR_TOPOLOGY_CHANGE_TIMER, IFLA_BR_GC_TIMER via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8bcaa51..755bfe0 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -860,12 +860,17 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ 0; } static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) { struct net_bridge *br = netdev_priv(brdev); + u64 hello_timer, tcn_timer, topology_change_timer, gc_timer; u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); @@ -882,6 +887,10 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) memcpy(root_id.addr, br->designated_root.addr, sizeof(root_id.addr)); memcpy(bridge_id.prio, br->bridge_id.prio, sizeof(bridge_id.prio)); memcpy(bridge_id.addr, br->bridge_id.addr, sizeof(bridge_id.addr)); + hello_timer = br_timer_value(&br->hello_timer); + tcn_timer = br_timer_value(&br->tcn_timer); + topology_change_timer = br_timer_value(&br->topology_change_timer); + gc_timer = br_timer_value(&br->gc_timer); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || @@ -897,7 +906,12 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || 
nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, - br->topology_change_detected)) + br->topology_change_detected) || + nla_put_u64(skb, IFLA_BR_HELLO_TIMER, hello_timer) || + nla_put_u64(skb, IFLA_BR_TCN_TIMER, tcn_timer) || + nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, + topology_change_timer) || + nla_put_u64(skb, IFLA_BR_GC_TIMER, gc_timer)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From 111189abc5c3f0ea6f516a6c3e8d8c3a2cf391d9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:35 +0200 Subject: bridge: netlink: add group_addr support Add IFLA_BR_GROUP_ADDR attribute to allow setting and retrieving the group_addr via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 755bfe0..a05a430 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -765,6 +765,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, + [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, + .len = ETH_ALEN }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -838,6 +840,25 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->group_fwd_mask = fwd_mask; } + if (data[IFLA_BR_GROUP_ADDR]) { + u8 new_addr[ETH_ALEN]; + + if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN) + return -EINVAL; + memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN); + if (!is_link_local_ether_addr(new_addr)) + return -EINVAL; + if (new_addr[5] == 1 || /* 802.3x Pause address */ + new_addr[5] == 2 || /* 802.3ad Slow protocols */ + new_addr[5] == 3) /* 802.1X PAE address */ + return -EINVAL; + spin_lock_bh(&br->lock); + 
memcpy(br->group_addr, new_addr, sizeof(br->group_addr)); + spin_unlock_bh(&br->lock); + br->group_addr_set = true; + br_recalculate_fwd_mask(br); + } + return 0; } @@ -864,6 +885,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ + nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ 0; } @@ -911,7 +933,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u64(skb, IFLA_BR_TCN_TIMER, tcn_timer) || nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, topology_change_timer) || - nla_put_u64(skb, IFLA_BR_GC_TIMER, gc_timer)) + nla_put_u64(skb, IFLA_BR_GC_TIMER, gc_timer) || + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.1 From 150217c688217e549ef8a36ea4f6718977373765 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:36 +0200 Subject: bridge: netlink: add fdb flush Simple attribute that flushes the bridge's fdb. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index a05a430..5853c57 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -859,6 +859,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br_recalculate_fwd_mask(br); } + if (data[IFLA_BR_FDB_FLUSH]) + br_fdb_flush(br); + return 0; } -- cgit v1.1 From a9a6bc70f5f70b3835b081e401b469b88c7c8a3a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:37 +0200 Subject: bridge: netlink: add support for multicast_router Add IFLA_BR_MCAST_ROUTER to allow setting and retrieving br->multicast_router when igmp snooping is enabled. 
Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5853c57..f4df609 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -767,6 +767,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, + [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -862,6 +863,16 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_FDB_FLUSH]) br_fdb_flush(br); +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (data[IFLA_BR_MCAST_ROUTER]) { + u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); + + err = br_multicast_set_router(br, multicast_router); + if (err) + return err; + } +#endif + return 0; } @@ -889,6 +900,9 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ +#endif 0; } @@ -945,6 +959,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return -EMSGSIZE; #endif +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router)) + return -EMSGSIZE; +#endif + return 0; } -- cgit v1.1 From 89126327f921bd278c72284d38428443bbef344f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:38 +0200 Subject: bridge: netlink: add support for multicast_snooping Add IFLA_BR_MCAST_SNOOPING to allow enabling/disabling multicast snooping via netlink. 
Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f4df609..25e1c66 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -768,6 +768,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -871,6 +872,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_MCAST_SNOOPING]) { + u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); + + err = br_multicast_toggle(br, mcast_snooping); + if (err) + return err; + } #endif return 0; @@ -902,6 +911,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ #endif 0; } @@ -960,7 +970,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router)) + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || + nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled)) return -EMSGSIZE; #endif -- cgit v1.1 From 295141d9049bdf4fa316b325d2e2501b210dbe06 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:39 +0200 Subject: bridge: netlink: add support for multicast_query_use_ifaddr Add IFLA_BR_MCAST_QUERY_USE_IFADDR to allow setting/getting br->multicast_query_use_ifaddr via netlink. 
Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 25e1c66..12ef844 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -769,6 +769,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { .len = ETH_ALEN }, [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -880,6 +881,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { + u8 val; + + val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); + br->multicast_query_use_ifaddr = !!val; + } #endif return 0; @@ -912,6 +920,7 @@ static size_t br_get_size(const struct net_device *brdev) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ #endif 0; } @@ -971,7 +980,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || - nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled)) + nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, + br->multicast_query_use_ifaddr)) return -EMSGSIZE; #endif -- cgit v1.1 From ba062d7cc6a09a8194eba975d5ee635378a55bfc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:40 +0200 Subject: bridge: netlink: add support for multicast_querier Add IFLA_BR_MCAST_QUERIER to allow setting/getting br->multicast_querier via 
netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 12ef844..e21296d 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -770,6 +770,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -888,6 +889,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]); br->multicast_query_use_ifaddr = !!val; } + + if (data[IFLA_BR_MCAST_QUERIER]) { + u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); + + err = br_multicast_set_querier(br, mcast_querier); + if (err) + return err; + } #endif return 0; @@ -921,6 +930,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ #endif 0; } @@ -982,7 +992,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, - br->multicast_query_use_ifaddr)) + br->multicast_query_use_ifaddr) || + nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier)) return -EMSGSIZE; #endif -- cgit v1.1 From 431db3c050af0be72b3b01fa7484982f35cb268f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:41 
+0200 Subject: bridge: netlink: add support for igmp's hash_elasticity Add IFLA_BR_MCAST_HASH_ELASTICITY to allow setting/getting br->hash_elasticity via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e21296d..b210a63 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -771,6 +771,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -897,6 +898,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]); + + br->hash_elasticity = val; + } #endif return 0; @@ -931,6 +938,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ #endif 0; } @@ -993,7 +1001,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br->multicast_query_use_ifaddr) || - nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier)) + nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, + br->hash_elasticity)) return -EMSGSIZE; #endif -- cgit v1.1 From 
858079fdae16421d4908722140346cfdddedf343 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:42 +0200 Subject: bridge: netlink: add support for igmp's hash_max Add IFLA_BR_MCAST_HASH_MAX to allow setting/getting br->hash_max via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index b210a63..d6b61b0 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -772,6 +772,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 }, [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -904,6 +905,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->hash_elasticity = val; } + + if (data[IFLA_BR_MCAST_HASH_MAX]) { + u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); + + err = br_multicast_set_hash_max(br, hash_max); + if (err) + return err; + } #endif return 0; @@ -939,6 +948,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ #endif 0; } @@ -1003,7 +1013,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) br->multicast_query_use_ifaddr) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, - br->hash_elasticity)) + br->hash_elasticity) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max)) return -EMSGSIZE; #endif 
-- cgit v1.1 From 79b859f573d6afa64e328cc7f50ad7a209e0c92d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:43 +0200 Subject: bridge: netlink: add support for multicast_last_member_count Add IFLA_BR_MCAST_LAST_MEMBER_CNT to allow setting/getting br->multicast_last_member_count via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d6b61b0..cf6ccae 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -773,6 +773,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 }, [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -913,6 +914,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); + + br->multicast_last_member_count = val; + } #endif return 0; @@ -949,6 +956,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ #endif 0; } @@ -1014,7 +1022,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, br->hash_elasticity) || - nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max)) + nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || 
+ nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, + br->multicast_last_member_count)) return -EMSGSIZE; #endif -- cgit v1.1 From b89e6babad4b7ca7298ad863c6c83dc76b0abdef Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:44 +0200 Subject: bridge: netlink: add support for multicast_startup_query_count Add IFLA_BR_MCAST_STARTUP_QUERY_CNT to allow setting/getting br->multicast_startup_query_count via netlink. Also align the ifla comments. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index cf6ccae..6744e30 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -774,6 +774,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 }, [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -920,6 +921,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->multicast_last_member_count = val; } + + if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { + u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); + + br->multicast_startup_query_count = val; + } #endif return 0; @@ -942,8 +949,8 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_BRIDGE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BR_ROOT_PORT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ - nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ - nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ + nla_total_size(sizeof(u8)) + /* 
IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ @@ -954,9 +961,10 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ - nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ - nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ - nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ #endif 0; } @@ -1024,7 +1032,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) br->hash_elasticity) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, - br->multicast_last_member_count)) + br->multicast_last_member_count) || + nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, + br->multicast_startup_query_count)) return -EMSGSIZE; #endif -- cgit v1.1 From 7e4df51eb35deedd3ba8d4db92a6c36fb7eff90a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:45 +0200 Subject: bridge: netlink: add support for igmp's intervals Add support to set/get all of the igmp's configurable intervals via netlink. These currently are: IFLA_BR_MCAST_LAST_MEMBER_INTVL IFLA_BR_MCAST_MEMBERSHIP_INTVL IFLA_BR_MCAST_QUERIER_INTVL IFLA_BR_MCAST_QUERY_INTVL IFLA_BR_MCAST_QUERY_RESPONSE_INTVL IFLA_BR_MCAST_STARTUP_QUERY_INTVL Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6744e30..30def4f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -775,6 +775,12 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 }, [IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, + [IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -927,6 +933,42 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->multicast_startup_query_count = val; } + + if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); + + br->multicast_last_member_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); + + br->multicast_membership_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); + + br->multicast_querier_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERY_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); + + br->multicast_query_interval = clock_t_to_jiffies(val); + } + + if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); + + br->multicast_query_response_interval = clock_t_to_jiffies(val); 
+ } + + if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { + u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); + + br->multicast_startup_query_interval = clock_t_to_jiffies(val); + } #endif return 0; @@ -965,6 +1007,12 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ #endif 0; } @@ -972,7 +1020,7 @@ static size_t br_get_size(const struct net_device *brdev) static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) { struct net_bridge *br = netdev_priv(brdev); - u64 hello_timer, tcn_timer, topology_change_timer, gc_timer; + u64 hello_timer, tcn_timer, topology_change_timer, gc_timer, clockval; u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); @@ -993,6 +1041,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) tcn_timer = br_timer_value(&br->tcn_timer); topology_change_timer = br_timer_value(&br->topology_change_timer); gc_timer = br_timer_value(&br->gc_timer); + clockval = 0; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || @@ -1036,6 +1085,25 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, br->multicast_startup_query_count)) return -EMSGSIZE; + + 
clockval = jiffies_to_clock_t(br->multicast_last_member_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_membership_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_querier_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_query_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_query_response_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval)) + return -EMSGSIZE; + clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); + if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval)) + return -EMSGSIZE; #endif return 0; -- cgit v1.1 From 93870cc02a0af4392401713d14235accafc752bc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:46 +0200 Subject: bridge: netlink: add support for netfilter tables config Add support to allow getting/setting netfilter tables settings. Currently these are IFLA_BR_NF_CALL_IPTABLES, IFLA_BR_NF_CALL_IP6TABLES and IFLA_BR_NF_CALL_ARPTABLES. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 30def4f..fd37caf 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -781,6 +781,9 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, [IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, + [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, + [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, + [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -970,6 +973,25 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->multicast_startup_query_interval = clock_t_to_jiffies(val); } #endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (data[IFLA_BR_NF_CALL_IPTABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]); + + br->nf_call_iptables = val ? true : false; + } + + if (data[IFLA_BR_NF_CALL_IP6TABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]); + + br->nf_call_ip6tables = val ? true : false; + } + + if (data[IFLA_BR_NF_CALL_ARPTABLES]) { + u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]); + + br->nf_call_arptables = val ? 
true : false; + } +#endif return 0; } @@ -1014,6 +1036,11 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ #endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ +#endif 0; } @@ -1070,7 +1097,6 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto)) return -EMSGSIZE; #endif - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) || @@ -1105,6 +1131,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval)) return -EMSGSIZE; #endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES, + br->nf_call_iptables ? 1 : 0) || + nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES, + br->nf_call_ip6tables ? 1 : 0) || + nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES, + br->nf_call_arptables ? 1 : 0)) + return -EMSGSIZE; +#endif return 0; } -- cgit v1.1 From 0f963b7592ef9e054974b6672b86ec1edd84b4bc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 4 Oct 2015 14:23:47 +0200 Subject: bridge: netlink: add support for default_pvid Add IFLA_BR_VLAN_DEFAULT_PVID to allow setting/getting bridge's default_pvid via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 13 ++++++++++++- net/bridge/br_private.h | 1 + net/bridge/br_vlan.c | 14 +++++++------- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index fd37caf..70efe2e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -784,6 +784,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -847,6 +848,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { + __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); + + err = __br_vlan_set_default_pvid(br, defpvid); + if (err) + return err; + } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { @@ -1007,6 +1016,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ @@ -1094,7 +1104,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING - if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto)) + if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || + nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid)) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1ff6a0f..09d3ecb 100644 --- 
a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -690,6 +690,7 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index b879111..eae07ee 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -727,7 +727,7 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) br->default_pvid = 0; } -static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) { const struct net_bridge_vlan *pvent; struct net_bridge_port *p; @@ -735,6 +735,11 @@ static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) int err = 0; unsigned long *changed; + if (!pvid) { + br_vlan_disable_default_pvid(br); + return 0; + } + changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), GFP_KERNEL); if (!changed) @@ -825,12 +830,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto unlock; } - - if (!pvid) - br_vlan_disable_default_pvid(br); - else - err = __br_vlan_set_default_pvid(br, pvid); - + err = __br_vlan_set_default_pvid(br, pvid); unlock: rtnl_unlock(); return err; -- cgit v1.1 From ed1b28a48b6c4e206bd88f5758393261710566f2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Oct 2015 23:33:59 +0200 Subject: Bluetooth: Limit userspace exposure of stack internal events The stack internal events that are exposed to userspace should be limited to HCI_DEV_REG, HCI_DEV_UNREG, HCI_DEV_UP and HCI_DEV_DOWN. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1505563..d9ad684 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -392,14 +392,12 @@ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) void hci_sock_dev_event(struct hci_dev *hdev, int event) { - struct hci_ev_si_device ev; - BT_DBG("hdev %s event %d", hdev->name, event); - /* Send event to monitor */ if (atomic_read(&monitor_promisc)) { struct sk_buff *skb; + /* Send event to monitor */ skb = create_monitor_event(hdev, event); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, @@ -408,10 +406,14 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) } } - /* Send event to sockets */ - ev.event = event; - ev.dev_id = hdev->id; - hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + if (event <= HCI_DEV_DOWN) { + struct hci_ev_si_device ev; + + /* Send event to sockets */ + ev.event = event; + ev.dev_id = hdev->id; + hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + } if (event == HCI_DEV_UNREG) { struct sock *sk; -- cgit v1.1 From 4a3f95b7b62e50a1e42e42ba6571ec9e747f4861 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Oct 2015 23:34:00 +0200 Subject: Bluetooth: Introduce HCI_DEV_OPEN and HCI_DEV_CLOSE events When opening the HCI transport via hdev->open send HCI_DEV_OPEN event and when closing the HCI transport via hdev->close send HCI_DEV_CLOSE. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 7935646..5af33c8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1385,6 +1385,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) goto done; } + hci_notify(hdev, HCI_DEV_OPEN); + atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); @@ -1466,6 +1468,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + hci_notify(hdev, HCI_DEV_CLOSE); + hdev->close(hdev); hdev->flags &= BIT(HCI_RAW); } @@ -1649,6 +1653,8 @@ int hci_dev_do_close(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + hci_notify(hdev, HCI_DEV_CLOSE); + /* After this point our queues are empty * and no tasks are scheduled. */ hdev->close(hdev); -- cgit v1.1 From 73d0d3c8671190ea982a8e79a7c79fbfe88f8f47 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Oct 2015 23:34:01 +0200 Subject: Bluetooth: Move HCI_RUNNING check into hci_send_frame In all callbacks for hdev->send the status of HCI_RUNNING is checked. So instead of repeating that code in every driver, move the check into the hci_send_frame function before calling hdev->send. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5af33c8..b955f71 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3536,6 +3536,11 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) /* Get rid of skb owner, prior to sending to the driver. 
*/ skb_orphan(skb); + if (!test_bit(HCI_RUNNING, &hdev->flags)) { + kfree_skb(skb); + return; + } + err = hdev->send(hdev, skb); if (err < 0) { BT_ERR("%s sending frame failed (%d)", hdev->name, err); -- cgit v1.1 From e9ca8bf157f2b45f8f670517c96da313083ee9b2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Oct 2015 23:34:02 +0200 Subject: Bluetooth: Move handling of HCI_RUNNING flag into core Setting and clearing of HCI_RUNNING flag in each and every driver is just duplicating the same code all over the place. So instead of having the driver do it in their hdev->open and hdev->close callbacks, set it globally in the core transport handling. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b955f71..40a6701 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1385,6 +1385,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) goto done; } + set_bit(HCI_RUNNING, &hdev->flags); hci_notify(hdev, HCI_DEV_OPEN); atomic_set(&hdev->cmd_cnt, 1); @@ -1468,6 +1469,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + clear_bit(HCI_RUNNING, &hdev->flags); hci_notify(hdev, HCI_DEV_CLOSE); hdev->close(hdev); @@ -1653,6 +1655,7 @@ int hci_dev_do_close(struct hci_dev *hdev) hdev->sent_cmd = NULL; } + clear_bit(HCI_RUNNING, &hdev->flags); hci_notify(hdev, HCI_DEV_CLOSE); /* After this point our queues are empty -- cgit v1.1 From 22db3cbcf9f91eef848db0986869822b4bf27193 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Oct 2015 23:34:03 +0200 Subject: Bluetooth: Send transport open and close monitor events When the core starts or shuts down the actual HCI transport, send a new monitor event that indicates that this is happening. These new events correspond to HCI_DEV_OPEN and HCI_DEV_CLOSE events. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d9ad684..64ebe84 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -329,6 +329,22 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; + case HCI_DEV_OPEN: + skb = bt_skb_alloc(0, GFP_ATOMIC); + if (!skb) + return NULL; + + opcode = cpu_to_le16(HCI_MON_OPEN_INDEX); + break; + + case HCI_DEV_CLOSE: + skb = bt_skb_alloc(0, GFP_ATOMIC); + if (!skb) + return NULL; + + opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX); + break; + default: return NULL; } @@ -358,6 +374,16 @@ static void send_monitor_replay(struct sock *sk) if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); + + if (!test_bit(HCI_RUNNING, &hdev->flags)) + continue; + + skb = create_monitor_event(hdev, HCI_DEV_OPEN); + if (!skb) + continue; + + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); } read_unlock(&hci_dev_list_lock); -- cgit v1.1 From 7656d842de93fd2d2de7b403062cad757cadf1df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Oct 2015 21:08:07 -0700 Subject: tcp: fix fastopen races vs lockless listener There are multiple races that need fixes: 1) skb_get() + queue skb + kfree_skb() is racy. An accept() can be done on another cpu, data consumed immediately. tcp_recvmsg() uses __kfree_skb() as it is assumed all skb found in socket receive queue are private. Then the kfree_skb() in tcp_rcv_state_process() uses an already freed skb. 2) tcp_reqsk_record_syn() needs to be done before tcp_try_fastopen() for the same reasons. 3) We want to send the SYNACK before queueing child into accept queue, otherwise we might reintroduce the ooo issue fixed in commit 7c85af881044 ("tcp: avoid reorders for TFO passive connections") Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/ipv4/tcp_fastopen.c | 26 +++++++------------------- net/ipv4/tcp_input.c | 6 +++++- 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 410ac48..93396bf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -168,8 +168,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, TCP_TIMEOUT_INIT, TCP_RTO_MAX); atomic_set(&req->rsk_refcnt, 2); - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); @@ -178,12 +176,10 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * Note that IPv6 might also have used skb_get() trick - * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) - * So we need to eventually get a clone of the packet, - * before inserting it in sk_receive_queue. + /* Queue the data carried in the SYN packet. + * We used to play tricky games with skb_get(). + * With lockless listener, it is a dead end. + * Do not think about it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) 
but no need to queue the skb since @@ -191,12 +187,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, */ end_seq = TCP_SKB_CB(skb)->end_seq; if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2; - - if (unlikely(skb_shared(skb))) - skb2 = skb_clone(skb, GFP_ATOMIC); - else - skb2 = skb_get(skb); + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (likely(skb2)) { skb_dst_drop(skb2); @@ -214,12 +205,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, } } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; - sk->sk_data_ready(sk); - bh_unlock_sock(child); - /* Note: sock_put(child) will be done by tcp_conn_request() - * after SYNACK packet is sent. + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. */ - WARN_ON(!req->sk); return child; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2710875..a95c8eb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6229,12 +6229,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); tcp_reqsk_record_syn(sk, req, skb); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, skb_get_queue_mapping(skb), &foc, false); + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, fastopen_sk); + sk->sk_data_ready(sk); + bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; -- cgit v1.1 From 004a5d0140ce1d05c1f5fce5df4baa2717a330e0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Oct 2015 21:08:10 -0700 Subject: net: use sk_fullsock() in __netdev_pick_tx() SYN_RECV & TIMEWAIT sockets are not full blown, they do not have a sk_dst_cache pointer. 
Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 323c04e..a229bf0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2974,6 +2974,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) new_index = skb_tx_hash(dev, skb); if (queue_index != new_index && sk && + sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); -- cgit v1.1 From a1a5344ddbe8fd3e080013b317ac9a664490cfdf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Oct 2015 21:08:11 -0700 Subject: tcp: avoid two atomic ops for syncookies inet_reqsk_alloc() is used to allocate a temporary request in order to generate a SYNACK with a cookie. Then later, syncookie validation also uses a temporary request. These paths already took a reference on listener refcount, we can avoid a couple of atomic operations. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_input.c | 8 +++++--- net/ipv6/syncookies.c | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8910c95..8e99681 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -595,7 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk); + req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); if (req == NULL) goto drop; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1361a3f..aed314f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -319,7 +319,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk); + req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); if (req == NULL) goto drop; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 729ceb5..8113c30 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -326,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ if (!req) goto out; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a95c8eb..ddadb31 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6042,9 +6042,11 @@ static void tcp_openreq_init(struct request_sock *req, } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener) + struct sock *sk_listener, + bool attach_listener) { - struct request_sock *req = reqsk_alloc(ops, sk_listener); + struct request_sock *req = reqsk_alloc(ops, sk_listener, + 
attach_listener); if (req) { struct inet_request_sock *ireq = inet_rsk(req); @@ -6143,7 +6145,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops, sk); + req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7606eba..f610b53 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -170,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk); + req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); if (!req) goto out; -- cgit v1.1 From 0e884c78ee19e902f300ed147083c28a0c6302f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20N=C3=B8rlund?= Date: Wed, 30 Sep 2015 10:12:21 +0200 Subject: ipv4: L3 hash-based multipath MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the per-packet multipath with a hash-based multipath using source and destination address. Signed-off-by: Peter Nørlund Signed-off-by: David S. 
Miller --- net/ipv4/fib_semantics.c | 140 +++++++++++++++++++++++++---------------------- net/ipv4/route.c | 16 ++++-- 2 files changed, 87 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 064bd3c..0c49d2f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -57,8 +57,7 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH - -static DEFINE_SPINLOCK(fib_multipath_lock); +u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -532,7 +531,67 @@ errout: return ret; } -#endif +static void fib_rebalance(struct fib_info *fi) +{ + int total; + int w; + struct in_device *in_dev; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + + in_dev = __in_dev_get_rcu(nh->nh_dev); + + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN) + continue; + + total += nh->nh_weight; + } endfor_nexthops(fi); + + w = 0; + change_nexthops(fi) { + int upper_bound; + + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_weight; + upper_bound = DIV_ROUND_CLOSEST(2147483648LL * w, + total) - 1; + } + + atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + } endfor_nexthops(fi); + + net_get_random_once(&fib_multipath_secret, + sizeof(fib_multipath_secret)); +} + +static inline void fib_add_weight(struct fib_info *fi, + const struct fib_nh *nh) +{ + fi->fib_weight += nh->nh_weight; +} + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define fib_rebalance(fi) do { } while (0) +#define fib_add_weight(fi, nh) do { } while (0) + +#endif /* 
CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, @@ -1094,8 +1153,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) + fib_rebalance(fi); + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1317,12 +1379,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; break; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1345,6 +1401,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) } ret++; } + + fib_rebalance(fi); } return ret; @@ -1467,20 +1525,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) !__in_dev_get_rtnl(dev)) continue; alive++; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~nh_flags; - spin_unlock_bh(&fib_multipath_lock); -#else nexthop_nh->nh_flags &= ~nh_flags; -#endif } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } + + fib_rebalance(fi); } return ret; @@ -1488,62 +1541,19 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) #ifdef CONFIG_IP_ROUTE_MULTIPATH -/* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. 
- */ -void fib_select_multipath(struct fib_result *res) +void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; - struct in_device *in_dev; - int w; - - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); - if (nexthop_nh->nh_flags & RTNH_F_DEAD) - continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nexthop_nh->nh_flags & RTNH_F_LINKDOWN) - continue; - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. */ - res->nh_sel = 0; - return; - } - } - - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (hash > atomic_read(&nh->nh_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); /* Race condition: route has just become dead. 
*/ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 76ca4e7..0cca444 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1658,8 +1658,12 @@ static int ip_mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + if (res->fi && res->fi->fib_nhs > 1) { + int h; + + h = fib_multipath_hash(saddr, daddr); + fib_select_multipath(res, h); + } #endif /* create a routing cache entry */ @@ -2189,8 +2193,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + int h; + + h = fib_multipath_hash(fl4->saddr, fl4->daddr); + fib_select_multipath(&res, h); + } else #endif if (!res.prefixlen && -- cgit v1.1 From 79a131592dbb81a2dba208622a2ffbfc53f28bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20N=C3=B8rlund?= Date: Wed, 30 Sep 2015 10:12:22 +0200 Subject: ipv4: ICMP packet inspection for multipath MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ICMP packets are inspected to let them route together with the flow they belong to, minimizing the chance that a problematic path will affect flows on other paths, and so that anycast environments can work with ECMP. Signed-off-by: Peter Nørlund Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 19 +++++++++++++++++- net/ipv4/route.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 6b96dee..36e2697 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -440,6 +440,22 @@ out_unlock: icmp_xmit_unlock(sk); } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* Source and destination is swapped. 
See ip_multipath_icmp_hash */ +static int icmp_multipath_hash_skb(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + return fib_multipath_hash(iph->daddr, iph->saddr); +} + +#else + +#define icmp_multipath_hash_skb(skb) (-1) + +#endif + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -464,7 +480,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key_hash(net, fl4, + icmp_multipath_hash_skb(skb_in)); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0cca444..54297d3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1651,6 +1651,48 @@ out: return err; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* To make ICMP packets follow the right flow, the multipath hash is + * calculated from the inner IP addresses in reverse order. 
+ */ +static int ip_multipath_icmp_hash(struct sk_buff *skb) +{ + const struct iphdr *outer_iph = ip_hdr(skb); + struct icmphdr _icmph; + const struct icmphdr *icmph; + struct iphdr _inner_iph; + const struct iphdr *inner_iph; + + if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) + goto standard_hash; + + icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), + &_icmph); + if (!icmph) + goto standard_hash; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + goto standard_hash; + } + + inner_iph = skb_header_pointer(skb, + outer_iph->ihl * 4 + sizeof(_icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto standard_hash; + + return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); + +standard_hash: + return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); +} + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, @@ -1661,7 +1703,10 @@ static int ip_mkroute_input(struct sk_buff *skb, if (res->fi && res->fi->fib_nhs > 1) { int h; - h = fib_multipath_hash(saddr, daddr); + if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) + h = ip_multipath_icmp_hash(skb); + else + h = fib_multipath_hash(saddr, daddr); fib_select_multipath(res, h); } #endif @@ -2030,7 +2075,8 @@ add: * Major route resolver routine. 
*/ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + int mp_hash) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2194,10 +2240,9 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { - int h; - - h = fib_multipath_hash(fl4->saddr, fl4->daddr); - fib_select_multipath(&res, h); + if (mp_hash < 0) + mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr); + fib_select_multipath(&res, mp_hash); } else #endif @@ -2220,7 +2265,7 @@ out: rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { -- cgit v1.1 From 84b00607aeb8f139a11c93036e1c0ee03dde5634 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Sep 2015 13:26:36 +0200 Subject: mac80211: use ktime_get_seconds The mac80211 code uses ktime_get_ts to measure the connected time. As this uses monotonic time, it is y2038 safe on 32-bit systems, but we still want to deprecate the use of 'timespec' because most other users are broken. This changes the code to use ktime_get_seconds() instead, which avoids the timespec structure and is slightly more efficient. Signed-off-by: Arnd Bergmann Cc: Johannes Berg Cc: linux-wireless@vger.kernel.org Signed-off-by: David S. 
Miller --- net/mac80211/sta_info.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 64f1936..c364445 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -303,7 +303,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; - struct timespec uptime; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); @@ -339,8 +338,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; - ktime_get_ts(&uptime); - sta->last_connected = uptime.tv_sec; + sta->last_connected = ktime_get_seconds(); ewma_signal_init(&sta->avg_signal); for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++) ewma_signal_init(&sta->chain_signal_avg[i]); @@ -1813,7 +1811,6 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; - struct timespec uptime; u32 thr = 0; int i, ac; @@ -1838,8 +1835,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) BIT(NL80211_STA_INFO_RX_DROP_MISC) | BIT(NL80211_STA_INFO_BEACON_LOSS); - ktime_get_ts(&uptime); - sinfo->connected_time = uptime.tv_sec - sta->last_connected; + sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | -- cgit v1.1 From f6389ecbc5f3ddc5860aab22bd7f7e1a8aeb3165 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Sep 2015 13:26:38 +0200 Subject: nfnetlink: use y2038 safe timestamp The __build_packet_message function fills a nfulnl_msg_packet_timestamp structure that uses 64-bit seconds and is therefore y2038 safe, 
but it uses an intermediate 'struct timespec' which is not. This trivially changes the code to use 'struct timespec64' instead, to correct the result on 32-bit architectures. Signed-off-by: Arnd Bergmann Cc: Pablo Neira Ayuso Cc: Patrick McHardy Cc: Jozsef Kadlecsik Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nfnetlink_log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 4670821..cc2300f 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -538,9 +538,9 @@ __build_packet_message(struct nfnl_log_net *log, if (skb->tstamp.tv64) { struct nfulnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(skb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); + struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + ts.sec = cpu_to_be64(kts.tv_sec); + ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; -- cgit v1.1 From 3dd7669f1f13772d0a846dee58379399f163729c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Sep 2015 13:26:39 +0200 Subject: ipv6: use ktime_t for internal timestamps The ipv6 mip6 implementation is one of only a few users of the skb_get_timestamp() function in the kernel, which is both unsafe on 32-bit architectures because of the 2038 overflow, and slightly less efficient than the skb_get_ktime() based approach. This converts the function call and the mip6_report_rate_limiter structure that stores the time stamp, eliminating all uses of timeval in the ipv6 code. Signed-off-by: Arnd Bergmann Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv6/mip6.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index b9779d4..60c79a0 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -118,7 +118,7 @@ static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) struct mip6_report_rate_limiter { spinlock_t lock; - struct timeval stamp; + ktime_t stamp; int iif; struct in6_addr src; struct in6_addr dst; @@ -184,20 +184,18 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static inline int mip6_report_rl_allow(struct timeval *stamp, +static inline int mip6_report_rl_allow(ktime_t stamp, const struct in6_addr *dst, const struct in6_addr *src, int iif) { int allow = 0; spin_lock_bh(&mip6_report_rl.lock); - if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec || - mip6_report_rl.stamp.tv_usec != stamp->tv_usec || + if (!ktime_equal(mip6_report_rl.stamp, stamp) || mip6_report_rl.iif != iif || !ipv6_addr_equal(&mip6_report_rl.src, src) || !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { - mip6_report_rl.stamp.tv_sec = stamp->tv_sec; - mip6_report_rl.stamp.tv_usec = stamp->tv_usec; + mip6_report_rl.stamp = stamp; mip6_report_rl.iif = iif; mip6_report_rl.src = *src; mip6_report_rl.dst = *dst; @@ -216,7 +214,7 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; - struct timeval stamp; + ktime_t stamp; int err = 0; if (unlikely(fl6->flowi6_proto == IPPROTO_MH && @@ -230,9 +228,9 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, (skb_network_header(skb) + offset); } - skb_get_timestamp(skb, &stamp); + stamp = skb_get_ktime(skb); - if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr, + if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr, hao ? 
&hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; -- cgit v1.1 From 3ef0a25bf9ef318615c810e24d244d55c09806d7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Sep 2015 13:26:40 +0200 Subject: net: sctp: avoid incorrect time_t use We want to avoid using time_t in the kernel because of the y2038 overflow problem. The use in sctp is not for storing seconds at all, but instead uses microseconds and is passed as 32-bit on all machines. This patch changes the type to u32, which better fits the use. Signed-off-by: Arnd Bergmann Cc: Vlad Yasevich Cc: Neil Horman Cc: linux-sctp@vger.kernel.org Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_statefuns.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 7954c52..763e06a 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2494,7 +2494,7 @@ static int sctp_process_param(struct sctp_association *asoc, __u16 sat; int retval = 1; sctp_scope_t scope; - time_t stale; + u32 stale; struct sctp_af *af; union sctp_addr_param *addr_param; struct sctp_transport *t; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d7eaa73..6f46aa1 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2306,7 +2306,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - time_t stale; + u32 stale; sctp_cookie_preserve_param_t bht; sctp_errhdr_t *err; struct sctp_chunk *reply; -- cgit v1.1 From ac8cfc7bb836835bd68c3ab9da242747e9df9542 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Sep 2015 06:18:23 -0700 Subject: tcp: restore fastopen operations I accidentally cleared fastopenq.max_qlen in reqsk_queue_alloc() while max_qlen can be set before listen() is called, using TCP_FASTOPEN socket option for example. 
Fixes: 0536fcc039a8 ("tcp: prepare fastopen code for upcoming listener changes") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/request_sock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 15c8538..5d26056 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -45,7 +45,6 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; - queue->fastopenq.max_qlen = 0; queue->rskq_accept_head = NULL; } -- cgit v1.1 From 3b20fc389705a4c959adebc494578cb99bb8be9e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 30 Sep 2015 16:54:07 -0400 Subject: RDS: Use a single TCP socket for both send and receive. Commit f711a6ae062c ("net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection.") modified rds-tcp so that an incoming SYN would ignore an existing "client" TCP connection which had the local port set to the transient port. The motivation for ignoring the existing "client" connection in f711a6ae was to avoid race conditions and an endless duel of reconnect attempts triggered by a restart/abort of one of the nodes in the TCP connection. However, having separate sockets for active and passive sides is avoidable, and the simpler model of a single TCP socket for both send and receives of all RDS connections associated with that tcp socket makes for easier observability. We avoid the race conditions from f711a6ae by attempting reconnects in rds_conn_shutdown if, and only if, the (new) c_outgoing bit is set for RDS_TRANS_TCP. The c_outgoing bit is initialized in __rds_conn_create(). A side-effect of re-using the client rds_connection for an incoming SYN is the potential of encountering duelling SYNs, i.e., we have an outgoing RDS_CONN_CONNECTING socket when we get the incoming SYN. 
The logic to arbitrate this criss-crossing SYN exchange in rds_tcp_accept_one() has been modified to emulate the BGP state machine: the smaller IP address should back off from the connection attempt. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 22 ++++++---------------- net/rds/rds.h | 4 +++- net/rds/tcp_listen.c | 22 +++++++++------------- 3 files changed, 18 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 49adeef..d456403 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -128,10 +128,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_transport *loop_trans; unsigned long flags; int ret; - struct rds_transport *otrans = trans; - if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) - goto new_conn; rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && @@ -147,7 +144,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (conn) goto out; -new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); @@ -207,6 +203,7 @@ new_conn: atomic_set(&conn->c_state, RDS_CONN_DOWN); conn->c_send_gen = 0; + conn->c_outgoing = (is_outgoing ? 
1 : 0); conn->c_reconnect_jiffies = 0; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); @@ -243,22 +240,13 @@ new_conn: /* Creating normal conn */ struct rds_connection *found; - if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) - found = NULL; - else - found = rds_conn_lookup(net, head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || - (otrans->t_type != RDS_TRANS_TCP)) { - /* Only the active side should be added to - * reconnect list for TCP. - */ - hlist_add_head_rcu(&conn->c_hash_node, head); - } + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } @@ -337,7 +325,9 @@ void rds_conn_shutdown(struct rds_connection *conn) rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); - rds_queue_reconnect(conn); + if (conn->c_trans->t_type != RDS_TRANS_TCP || + conn->c_outgoing == 1) + rds_queue_reconnect(conn); } else { rcu_read_unlock(); } diff --git a/net/rds/rds.h b/net/rds/rds.h index afb4048..b4c7ac0 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -86,7 +86,9 @@ struct rds_connection { struct hlist_node c_hash_node; __be32 c_laddr; __be32 c_faddr; - unsigned int c_loopback:1; + unsigned int c_loopback:1, + c_outgoing:1, + c_pad_to_32:30; struct rds_connection *c_passive; struct rds_cong_map *c_lcong; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 444d78d..1d90240 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -110,28 +110,24 @@ int rds_tcp_accept_one(struct socket *sock) goto out; } /* An incoming SYN request came in, and TCP just accepted it. - * We always create a new conn for listen side of TCP, and do not - * add it to the c_hash_list. * * If the client reboots, this conn will need to be cleaned up. 
* rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - WARN_ON(!rs_tcp || rs_tcp->t_sock); - - /* - * see the comment above rds_queue_delayed_reconnect() - */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - if (rds_conn_state(conn) == RDS_CONN_UP) - rds_tcp_stats_inc(s_tcp_listen_closed_stale); - else - rds_tcp_stats_inc(s_tcp_connect_raced); - rds_conn_drop(conn); + if (rs_tcp->t_sock && + ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + struct sock *nsk = new_sock->sk; + + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); + new_sock = NULL; ret = 0; goto out; } + rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); rds_tcp_set_callbacks(new_sock, conn); rds_connect_complete(conn); new_sock = NULL; -- cgit v1.1 From 1edd6a14d24f21b8b478970c63a243a08e2b55b0 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 30 Sep 2015 16:54:08 -0400 Subject: RDS-TCP: Do not bloat sndbuf/rcvbuf in rds_tcp_tune Using the value of RDS_TCP_DEFAULT_BUFSIZE (128K) clobbers efficient use of TSO because it inflates the size_goal that is computed in tcp_sendmsg/tcp_sendpage and skews packet latency, and the default values for these parameters actually result in significantly better performance. Request-response tests using rds-stress with a packet size of 100K with 16 threads (test parameters -q 100000 -a 256 -t16 -d16) between a single pair of IP addresses achieve a throughput of 6-8 Gbps. Without this patch, throughput maxes at 2-3 Gbps under equivalent conditions on these platforms. Signed-off-by: Sowmini Varadhan Signed-off-by: David S.
Miller --- net/rds/tcp.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c42b60b..9d6ddba 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -67,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock) set_fs(oldfs); } +/* All module specific customizations to the RDS-TCP socket should be done in + * rds_tcp_tune() and applied after socket creation. In general these + * customizations should be tunable via module_param() + */ void rds_tcp_tune(struct socket *sock) { - struct sock *sk = sock->sk; - rds_tcp_nonagle(sock); - - /* - * We're trying to saturate gigabit with the default, - * see svc_sock_setbufsize(). - */ - lock_sock(sk); - sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; - sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; - sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; - release_sock(sk); } u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) -- cgit v1.1 From 76b29ef120f5b845f862de08b92c7d2317b50907 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 30 Sep 2015 16:54:09 -0400 Subject: RDS-TCP: Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in rds_tcp_xmit For the same reasons as commit 2f5338442425 ("tcp: allow splice() to build full TSO packets") and commit 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push() once"), rds_tcp_xmit may have multiple pages to send, so use the MSG_MORE and MSG_SENDPAGE_NOTLAST as hints to tcp_sendpage() Signed-off-by: Sowmini Varadhan Signed-off-by: David S. 
Miller --- net/rds/tcp_send.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 53b17ca..2894e60 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, struct rds_tcp_connection *tc = conn->c_transport_data; int done = 0; int ret = 0; + int more; if (hdr_off == 0) { /* @@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } + more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0; while (sg < rm->data.op_nents) { + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + ret = tc->t_sock->ops->sendpage(tc->t_sock, sg_page(&rm->data.op_sg[sg]), rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, - MSG_DONTWAIT|MSG_NOSIGNAL); + flags); rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, ret); @@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, off = 0; sg++; } + if (sg == rm->data.op_nents - 1) + more = 0; } out: -- cgit v1.1 From bab18991871545dfbd10c931eb0fe8f7637156a9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Oct 2015 15:17:33 +0200 Subject: bpf, seccomp: prepare for upcoming criu support The current ongoing effort to dump existing cBPF seccomp filters back to user space requires to hold the pre-transformed instructions like we do in case of socket filters from sk_attach_filter() side, so they can be reloaded in original form at a later point in time by utilities such as criu. To prepare for this, simply extend the bpf_prog_create_from_user() API to hold a flag that tells whether we should store the original or not. Also, fanout filters could make use of that in future for things like diag. 
While fanout filters already use bpf_prog_destroy(), move seccomp over to them as well to handle original programs when present. Signed-off-by: Daniel Borkmann Cc: Tycho Andersen Cc: Pavel Emelyanov Cc: Kees Cook Cc: Andy Lutomirski Cc: Alexei Starovoitov Tested-by: Tycho Andersen Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 16 +++++++++++----- net/packet/af_packet.c | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 53a5036..da3e535 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1084,16 +1084,18 @@ EXPORT_SYMBOL_GPL(bpf_prog_create); * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler + * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, - bpf_aux_classic_check_t trans) + bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; + int err; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) @@ -1109,12 +1111,16 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, } fp->len = fprog->len; - /* Since unattached filters are not copied back to user - * space through sk_get_filter(), we do not need to hold - * a copy here, and can spare us the work. - */ fp->orig_prog = NULL; + if (save_orig) { + err = bpf_prog_store_orig_filter(fp, fprog); + if (err) { + __bpf_prog_free(fp); + return -ENOMEM; + } + } + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index aa4b15c..81c900f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1567,7 +1567,7 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, if (copy_from_user(&fprog, data, len)) return -EFAULT; - ret = bpf_prog_create_from_user(&new, &fprog, NULL); + ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; -- cgit v1.1 From 5edfcee5ed73eb9537987c4ddb6bf062b6943b73 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 25 Sep 2015 08:42:00 +0200 Subject: mac80211: make ieee80211_new_mesh_header return unsigned The function always returns non-negative values. The problem has been detected using the proposed semantic patch scripts/coccinelle/tests/assign_signed_to_unsigned.cocci [1]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2046107 Signed-off-by: Andrzej Hajda Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 6 +++--- net/mac80211/mesh.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 62b3e29..626e8de 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -606,9 +606,9 @@ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, * * Return the header length. 
*/ -int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, - struct ieee80211s_hdr *meshhdr, - const char *addr4or5, const char *addr6) +unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, + struct ieee80211s_hdr *meshhdr, + const char *addr4or5, const char *addr6) { if (WARN_ON(!addr4or5 && addr6)) return 0; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index c60be85..a159634 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -207,9 +207,9 @@ struct mesh_rmc { /* Various */ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, const u8 *da, const u8 *sa); -int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, - struct ieee80211s_hdr *meshhdr, - const char *addr4or5, const char *addr6); +unsigned int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata, + struct ieee80211s_hdr *meshhdr, + const char *addr4or5, const char *addr6); int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, const u8 *addr, struct ieee80211s_hdr *mesh_hdr); bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, -- cgit v1.1 From 4bebdd7a4d2960b2ff6c40b27156d041ea270765 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 10 Sep 2015 11:57:14 -0700 Subject: RDS: defer the over_batch work to send worker The current process gives up if its send work goes over the batch limit. The work queue will get kicked to finish off any other requests. This fixes the remaining condition from commit 443be0e5affe ("RDS: make sure not to loop forever inside rds_send_xmit"). The restart condition is only for the case where we reached the over_batch code for some other reason, so we just retry once more before giving up. While at it, make sure we use the already available 'send_batch_count' parameter instead of a magic value. The batch count threshold value of 1024 came via commit 443be0e5affe ("RDS: make sure not to loop forever inside rds_send_xmit"). 
The idea is to process as big a batch as we can, but at the same time not hold up other processes waiting to send. Hence back off after the send_batch_count limit (1024) to avoid soft lockups. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/send.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 4df61a5..b0acd45 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -38,6 +38,7 @@ #include <linux/list.h> #include <linux/ratelimit.h> #include <linux/export.h> +#include <linux/sizes.h> #include "rds.h" @@ -51,7 +52,7 @@ * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ -static int send_batch_count = 64; +static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); @@ -223,7 +224,7 @@ restart: * through a lot of messages, lets back off and see * if anyone else jumps in */ - if (batch_count >= 1024) + if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&conn->c_lock, flags); @@ -423,7 +424,9 @@ over_batch: !list_empty(&conn->c_send_queue)) && send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); - goto restart; + if (batch_count < send_batch_count) + goto restart; + queue_delayed_work(rds_wq, &conn->c_send_w, 1); } } out: -- cgit v1.1 From db6526dcb51b054961a2d96ba43dec23e38818b3 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 11 Sep 2015 15:44:29 -0700 Subject: RDS: use rds_send_xmit() state instead of RDS_LL_SEND_FULL In the transport-independent rds_sendmsg(), we shouldn't make decisions based on RDS_LL_SEND_FULL, which is used to manage the ring for RDMA-based transports. We can safely issue rds_send_xmit() and then use its return value to decide whether to defer the work. This will also fix the scenario where at times we are seeing connections stuck with the LL_SEND_FULL bit getting set and never cleared. 
We kick krdsd after any time we see -ENOMEM or -EAGAIN from the ring allocation code. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/send.c | 10 ++++++---- net/rds/threads.c | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index b0acd45..a081a64 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1123,8 +1123,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) */ rds_stats_inc(s_send_queued); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return payload_len; @@ -1180,8 +1181,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return 0; diff --git a/net/rds/threads.c b/net/rds/threads.c index dc2402e..454aa6d 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -162,7 +162,9 @@ void rds_send_worker(struct work_struct *work) int ret; if (rds_conn_state(conn) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); ret = rds_send_xmit(conn); + cond_resched(); rdsdebug("conn %p ret %d\n", conn, ret); switch (ret) { case -EAGAIN: -- cgit v1.1 From f4f943c958a2869b0601092857c1cf0e485d3ce8 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sun, 6 Sep 2015 02:18:51 -0400 Subject: RDS: IB: ack more receive completions to improve performance For better performance, we split the receive completion IRQ handler. That lets us acknowledge several WCE events in one call. We also limit the WC to max 32 to avoid latency. 
Acknowledging several completions in one call instead of several calls each time will provide better performance since less mutual exclusion locks are being performed. In next patch, send completion is also split which re-uses the poll_cq() and hence the code is moved to ib_cm.c Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 28 +++++++++-- net/rds/ib_cm.c | 70 ++++++++++++++++++++++++++- net/rds/ib_recv.c | 136 +++++++++++++++-------------------------------------- net/rds/ib_stats.c | 3 +- 4 files changed, 132 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index f1fd5ffec..727759b 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -24,6 +24,8 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 +#define RDS_IB_WC_MAX 32 + extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -89,6 +91,20 @@ struct rds_ib_work_ring { atomic_t w_free_ctr; }; +/* Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. 
+ */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + + struct rds_ib_device; struct rds_ib_connection { @@ -102,6 +118,10 @@ struct rds_ib_connection { struct ib_pd *i_pd; struct ib_cq *i_send_cq; struct ib_cq *i_recv_cq; + struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + + /* interrupt handling */ + struct tasklet_struct i_recv_tasklet; /* tx */ struct rds_ib_work_ring i_send_ring; @@ -112,7 +132,6 @@ struct rds_ib_connection { atomic_t i_signaled_sends; /* rx */ - struct tasklet_struct i_recv_tasklet; struct mutex i_recv_mutex; struct rds_ib_work_ring i_recv_ring; struct rds_ib_incoming *i_ibinc; @@ -199,13 +218,14 @@ struct rds_ib_statistics { uint64_t s_ib_connect_raced; uint64_t s_ib_listen_closed_stale; uint64_t s_ib_tx_cq_call; + uint64_t s_ib_evt_handler_call; + uint64_t s_ib_tasklet_call; uint64_t s_ib_tx_cq_event; uint64_t s_ib_tx_ring_full; uint64_t s_ib_tx_throttle; uint64_t s_ib_tx_sg_mapping_failure; uint64_t s_ib_tx_stalled; uint64_t s_ib_tx_credit_updates; - uint64_t s_ib_rx_cq_call; uint64_t s_ib_rx_cq_event; uint64_t s_ib_rx_ring_empty; uint64_t s_ib_rx_refill_from_cq; @@ -324,7 +344,8 @@ void rds_ib_recv_free_caches(struct rds_ib_connection *ic); void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc, + struct rds_ib_ack_state *state); void rds_ib_recv_tasklet_fn(unsigned long data); void rds_ib_recv_init_ring(struct rds_ib_connection *ic); void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); @@ -332,6 +353,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic); void rds_ib_attempt_ack(struct rds_ib_connection *ic); void 
rds_ib_ack_send_complete(struct rds_ib_connection *ic); u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required); /* ib_ring.c */ void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 9043f5c..28e0979 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -216,6 +216,72 @@ static void rds_ib_cq_event_handler(struct ib_event *event, void *data) event->event, ib_event_msg(event->event), data); } +/* Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr; + int i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + +static void rds_ib_tasklet_fn_recv(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + struct rds_ib_ack_state 
state; + + BUG_ON(!rds_ibdev); + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&state, 0, sizeof(state)); + poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); +} + static void rds_ib_qp_event_handler(struct ib_event *event, void *data) { struct rds_connection *conn = data; @@ -282,7 +348,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) } cq_attr.cqe = ic->i_recv_ring.w_nr; - ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_recv_cq)) { @@ -743,7 +809,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) } INIT_LIST_HEAD(&ic->ib_node); - tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, + tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv, (unsigned long) ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index f43831e..96744b7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -596,8 +596,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) * wr_id and avoids working with the ring in that case. 
*/ #ifndef KERNEL_HAS_ATOMIC64 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, - int ack_required) +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { unsigned long flags; @@ -622,8 +621,7 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic) return seq; } #else -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, - int ack_required) +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { @@ -830,20 +828,6 @@ static void rds_ib_cong_recv(struct rds_connection *conn, rds_cong_map_updated(map, uncongested); } -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_ib_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - static void rds_ib_process_recv(struct rds_connection *conn, struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) @@ -969,96 +953,50 @@ static void rds_ib_process_recv(struct rds_connection *conn, } } -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. 
- */ -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p cq %p\n", conn, cq); - - rds_ib_stats_inc(s_ib_rx_cq_call); - - tasklet_schedule(&ic->i_recv_tasklet); -} - -static inline void rds_poll_cq(struct rds_ib_connection *ic, - struct rds_ib_ack_state *state) +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc, + struct rds_ib_ack_state *state) { struct rds_connection *conn = ic->conn; - struct ib_wc wc; struct rds_ib_recv_work *recv; - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - ib_wc_status_msg(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_rx_cq_event); + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); - recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; - - ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. 
- */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, state); - } else { - /* We expect errors as the qp is drained during shutdown */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had " - "status %u (%s), disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status, - ib_wc_status_msg(wc.status)); - } + rds_ib_stats_inc(s_ib_rx_cq_event); + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, + DMA_FROM_DEVICE); - /* - * rds_ib_process_recv() doesn't always consume the frag, and - * we might not have called it at all if the wc didn't indicate - * success. We already unmapped the frag's pages, though, and - * the following rds_ib_ring_free() call tells the refill path - * that it will not find an allocated frag here. Make sure we - * keep that promise by freeing a frag that's still on the ring. - */ - if (recv->r_frag) { - rds_ib_frag_free(ic, recv->r_frag); - recv->r_frag = NULL; - } - rds_ib_ring_free(&ic->i_recv_ring, 1); + /* Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. 
+ */ + if (wc->status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc->byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status)); } -} -void rds_ib_recv_tasklet_fn(unsigned long data) -{ - struct rds_ib_connection *ic = (struct rds_ib_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state = { 0, }; - - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); - - if (state.ack_next_valid) - rds_ib_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; + /* rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. 
+ */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; } - if (rds_conn_up(conn)) - rds_ib_attempt_ack(ic); + rds_ib_ring_free(&ic->i_recv_ring, 1); /* If we ever end up with a really empty receive ring, we're * in deep trouble, as the sender will definitely see RNR diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 2d5965d..bdf6115 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -42,14 +42,15 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", + "s_ib_evt_handler_call", "ib_tx_cq_call", + "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", - "ib_rx_cq_call", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", -- cgit v1.1 From 0c28c04500cf956c82d542c199f5bddabd590af3 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sun, 6 Sep 2015 02:18:51 -0400 Subject: RDS: IB: split send completion handling and do batch ack Similar to what we did with receive CQ completion handling, we split the transmit completion handler so that it lets us implement batched work completion handling. We re-use the cq_poll routine and makes use of RDS_IB_SEND_OP to identify the send vs receive completion event handler invocation. 
Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 6 ++- net/rds/ib_cm.c | 45 ++++++++++++++++++++-- net/rds/ib_send.c | 110 +++++++++++++++++++++++++---------------------------- net/rds/ib_stats.c | 1 - net/rds/send.c | 1 + 5 files changed, 98 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 727759b..3a8cd31 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -25,6 +25,7 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 #define RDS_IB_WC_MAX 32 +#define RDS_IB_SEND_OP BIT_ULL(63) extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -118,9 +119,11 @@ struct rds_ib_connection { struct ib_pd *i_pd; struct ib_cq *i_send_cq; struct ib_cq *i_recv_cq; + struct ib_wc i_send_wc[RDS_IB_WC_MAX]; struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; /* interrupt handling */ + struct tasklet_struct i_send_tasklet; struct tasklet_struct i_recv_tasklet; /* tx */ @@ -217,7 +220,6 @@ struct rds_ib_device { struct rds_ib_statistics { uint64_t s_ib_connect_raced; uint64_t s_ib_listen_closed_stale; - uint64_t s_ib_tx_cq_call; uint64_t s_ib_evt_handler_call; uint64_t s_ib_tasklet_call; uint64_t s_ib_tx_cq_event; @@ -371,7 +373,7 @@ extern wait_queue_head_t rds_ib_ring_empty_wait; void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 28e0979..8f51d0d 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -250,11 +250,34 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq 
*cq, rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", (unsigned long long)wc->wr_id, wc->status, wc->byte_len, be32_to_cpu(wc->ex.imm_data)); - rds_ib_recv_cqe_handler(ic, wc, ack_state); + + if (wc->wr_id & RDS_IB_SEND_OP) + rds_ib_send_cqe_handler(ic, wc); + else + rds_ib_recv_cqe_handler(ic, wc, ack_state); } } } +static void rds_ib_tasklet_fn_send(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state state; + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&state, 0, sizeof(state)); + poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + + if (rds_conn_up(conn) && + (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued))) + rds_send_xmit(ic->conn); +} + static void rds_ib_tasklet_fn_recv(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; @@ -304,6 +327,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) } } +static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_send_tasklet); +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. 
@@ -337,7 +372,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_pd = rds_ibdev->pd; cq_attr.cqe = ic->i_send_ring.w_nr + 1; - ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + + ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_send_cq)) { @@ -703,6 +739,7 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); /* first destroy the ib state that generates callbacks */ @@ -809,8 +846,10 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) } INIT_LIST_HEAD(&ic->ib_node); + tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send, + (unsigned long)ic); tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv, - (unsigned long) ic); + (unsigned long)ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 4e88047..670882c 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i; + send->s_wr.wr_id = i | RDS_IB_SEND_OP; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; @@ -237,81 +237,73 @@ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) * unallocs the next free entry in the ring it doesn't alter which is * the next to be freed, which is what this is concerned with. 
*/ -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) { - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; struct rds_message *rm = NULL; - struct ib_wc wc; + struct rds_connection *conn = ic->conn; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; - int ret; int nr_sig = 0; - rdsdebug("cq %p conn %p\n", cq, conn); - rds_ib_stats_inc(s_ib_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - ib_wc_status_msg(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_tx_cq_event); - - if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (time_after(jiffies, ic->i_ack_queued + HZ/2)) - rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ack_send_complete(ic); - continue; - } - oldest = rds_ib_ring_oldest(&ic->i_send_ring); + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); - completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + if (wc->wr_id == RDS_IB_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + return; + } - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - if (send->s_wr.send_flags & IB_SEND_SIGNALED) - nr_sig++; + oldest = rds_ib_ring_oldest(&ic->i_send_ring); - rm = rds_ib_send_unmap_op(ic, send, wc.status); + completed = rds_ib_ring_completed(&ic->i_send_ring, + (wc->wr_id & ~RDS_IB_SEND_OP), + oldest); - if (time_after(jiffies, send->s_queued + HZ/2)) - 
rds_ib_stats_inc(s_ib_tx_stalled); + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - if (send->s_op) { - if (send->s_op == rm->m_final_op) { - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - } - rds_message_put(rm); - send->s_op = NULL; - } + rm = rds_ib_send_unmap_op(ic, send, wc->status); - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } + if (time_after(jiffies, send->s_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ring_free(&ic->i_send_ring, completed); - rds_ib_sub_signaled(ic, nr_sig); - nr_sig = 0; - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status " - "%u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc.status, - ib_wc_status_msg(wc.status)); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get + * flushed out, wake them up now + */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc->status, + ib_wc_status_msg(wc->status)); } } diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 
bdf6115..8c8b84f 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -43,7 +43,6 @@ static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", "s_ib_evt_handler_call", - "ib_tx_cq_call", "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", diff --git a/net/rds/send.c b/net/rds/send.c index a081a64..ee49c25 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -432,6 +432,7 @@ over_batch: out: return ret; } +EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { -- cgit v1.1 From 9441c973e1e0e9885537a3a86020fe8e121e9a98 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 19 Sep 2015 14:01:09 -0400 Subject: RDS: IB: handle rds_ibdev release case instead of crashing the kernel Just in case we are still handling the QP receive completion while the rds_ibdev is released, drop the connection instead of crashing the kernel. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib_cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 8f51d0d..2b2370e 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -285,7 +285,8 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) struct rds_ib_device *rds_ibdev = ic->rds_ibdev; struct rds_ib_ack_state state; - BUG_ON(!rds_ibdev); + if (!rds_ibdev) + rds_conn_drop(conn); rds_ib_stats_inc(s_ib_tasklet_call); -- cgit v1.1 From 2e1d6b813ac146db1e33ebb9b90441012dde4952 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sun, 13 Sep 2015 22:34:37 -0700 Subject: RDS: IB: fix the rds_ib_fmr_wq kick call RDS IB mr pool has its own workqueue 'rds_ib_fmr_wq', so we need to use queue_delayed_work() to kick the work. This was hurting the performance since pool maintenance was less often triggered from other path. 
Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 872f523..b6644fa 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -319,7 +319,7 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) int err = 0, iter = 0; if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); while (1) { ibmr = rds_ib_reuse_fmr(pool); -- cgit v1.1 From 26139dc1dbf79fd1ae1e2766a1f66b0728bd67b3 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 15 Sep 2015 18:20:35 -0700 Subject: RDS: IB: use already available pool handle from ibmr rds_ib_mr already keeps the handle of the pool it is associated with. Let's use that instead of the roundabout way of fetching it from rds_ib_device. No functional change. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib_rdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index b6644fa..52d889a 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -522,8 +522,7 @@ static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) __rds_ib_teardown_mr(ibmr); if (pinned) { - struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } @@ -717,8 +716,8 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work) void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); -- cgit v1.1 From
67161e250a28de5cdafa99a3b659b1e2e269fd7e Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 19 Sep 2015 17:21:22 -0400 Subject: RDS: IB: mark rds_ib_fmr_wq static Fix the warning below by marking rds_ib_fmr_wq static: net/rds/ib_rdma.c:87:25: warning: symbol 'rds_ib_fmr_wq' was not declared. Should it be static? Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 52d889a..bb62024 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -83,7 +83,7 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; -struct workqueue_struct *rds_ib_fmr_wq; +static struct workqueue_struct *rds_ib_fmr_wq; int rds_ib_fmr_init(void) { -- cgit v1.1 From 41a4e9646229801624e38f7a1cc53033a0affdb1 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 19 Sep 2015 13:06:08 -0400 Subject: RDS: IB: use max_mr from HCA caps than max_fmr All HCA drivers seem to populate max_mr caps and a few of them populate both max_mr and max_fmr. Hence update RDS code to make use of max_mr. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 2d3f2ab..883813a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -148,8 +148,8 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_fmrs = dev_attr->max_fmr ? - min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : + rds_ibdev->max_fmrs = dev_attr->max_mr ?
+ min_t(unsigned int, dev_attr->max_mr, fmr_pool_size) : fmr_pool_size; rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; -- cgit v1.1 From 06766513232d1619ac84e87b1d839d3fcc23a540 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 10 Sep 2015 21:20:57 -0700 Subject: RDS: IB: split mr pool to improve 8K messages performance 8K message sizes are a pretty important use case for current RDS workloads, so we make provision to have 8K MRs available from the pool. Based on the number of SGs in the RDS message, we pick a pool to use. Also, to make sure that we don't underutilise MRs when, say, 8K messages are dominating, which could lead to the 8K pool being exhausted, we fall back to the 1M pool until the 8K pool recovers for use. This helps to at least push ~55 kB/s bidirectional data which is a nice improvement. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar --- net/rds/ib.c | 47 +++++++++++++++++-------- net/rds/ib.h | 43 ++++++++++++++++------- net/rds/ib_rdma.c | 101 +++++++++++++++++++++++++++++++++++++---------------- net/rds/ib_stats.c | 18 ++++++---- 4 files changed, 147 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 883813a..a833ab7 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -43,14 +43,14 @@ #include "rds.h" #include "ib.h" -static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; -unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; +unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; -module_param(fmr_pool_size, int, 0444); -MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); -module_param(fmr_message_size, int, 0444); -MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_fmr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
+module_param(rds_ib_fmr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); @@ -97,8 +97,10 @@ static void rds_ib_dev_free(struct work_struct *work) struct rds_ib_device *rds_ibdev = container_of(work, struct rds_ib_device, free_work); - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr_8k_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); + if (rds_ibdev->mr_1m_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); @@ -148,9 +150,13 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_fmrs = dev_attr->max_mr ? - min_t(unsigned int, dev_attr->max_mr, fmr_pool_size) : - fmr_pool_size; + rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? + min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; + + rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? 
+ min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; @@ -162,12 +168,25 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); - if (IS_ERR(rds_ibdev->mr_pool)) { - rds_ibdev->mr_pool = NULL; + rds_ibdev->mr_1m_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); + if (IS_ERR(rds_ibdev->mr_1m_pool)) { + rds_ibdev->mr_1m_pool = NULL; goto put_dev; } + rds_ibdev->mr_8k_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); + if (IS_ERR(rds_ibdev->mr_8k_pool)) { + rds_ibdev->mr_8k_pool = NULL; + goto put_dev; + } + + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", + dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, + rds_ibdev->max_8k_fmrs); + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); diff --git a/net/rds/ib.h b/net/rds/ib.h index 3a8cd31..f17d095 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -9,8 +9,11 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 8192 +#define RDS_FMR_1M_POOL_SIZE (8192 / 2) +#define RDS_FMR_1M_MSG_SIZE 256 +#define RDS_FMR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1)) +#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 @@ -189,15 +192,23 @@ struct rds_ib_ipaddr { struct rcu_head rcu; }; +enum { + RDS_IB_MR_8K_POOL, + RDS_IB_MR_1M_POOL, +}; + struct rds_ib_device { struct list_head list; struct list_head ipaddr_list; struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - struct rds_ib_mr_pool *mr_pool; - unsigned int fmr_max_remaps; unsigned int 
max_fmrs; + struct rds_ib_mr_pool *mr_1m_pool; + struct rds_ib_mr_pool *mr_8k_pool; + unsigned int fmr_max_remaps; + unsigned int max_8k_fmrs; + unsigned int max_1m_fmrs; int max_sge; unsigned int max_wrs; unsigned int max_initiator_depth; @@ -239,12 +250,18 @@ struct rds_ib_statistics { uint64_t s_ib_ack_send_delayed; uint64_t s_ib_ack_send_piggybacked; uint64_t s_ib_ack_received; - uint64_t s_ib_rdma_mr_alloc; - uint64_t s_ib_rdma_mr_free; - uint64_t s_ib_rdma_mr_used; - uint64_t s_ib_rdma_mr_pool_flush; - uint64_t s_ib_rdma_mr_pool_wait; - uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_rdma_mr_8k_alloc; + uint64_t s_ib_rdma_mr_8k_free; + uint64_t s_ib_rdma_mr_8k_used; + uint64_t s_ib_rdma_mr_8k_pool_flush; + uint64_t s_ib_rdma_mr_8k_pool_wait; + uint64_t s_ib_rdma_mr_8k_pool_depleted; + uint64_t s_ib_rdma_mr_1m_alloc; + uint64_t s_ib_rdma_mr_1m_free; + uint64_t s_ib_rdma_mr_1m_used; + uint64_t s_ib_rdma_mr_1m_pool_flush; + uint64_t s_ib_rdma_mr_1m_pool_wait; + uint64_t s_ib_rdma_mr_1m_pool_depleted; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; }; @@ -296,7 +313,8 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_message_size; +extern unsigned int rds_ib_fmr_1m_pool_size; +extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; @@ -326,7 +344,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, 
struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index bb62024..a234074 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -65,6 +65,7 @@ struct rds_ib_mr { * Our own little FMR pool */ struct rds_ib_mr_pool { + unsigned int pool_type; struct mutex flush_lock; /* serialize fmr invalidate */ struct delayed_work flush_worker; /* flush worker */ @@ -234,7 +235,8 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) { struct rds_ib_mr_pool *pool; @@ -242,6 +244,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); + pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); @@ -249,28 +252,30 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - pool->fmr_attr.max_pages = fmr_message_size; + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; + pool->max_items = RDS_FMR_1M_POOL_SIZE; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; + pool->max_items = RDS_FMR_8K_POOL_SIZE; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; - - /* We never allow more than max_items MRs to be allocated. 
- * When we exceed more than max_items_soft, we start freeing - * items more aggressively. - * Make sure that max_items > max_items_soft > max_items / 2 - */ pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - pool->max_items = rds_ibdev->max_fmrs; return pool; } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->fmr_attr.max_pages; + iinfo->rdma_mr_max = pool_1m->max_items; + iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) @@ -312,15 +317,29 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, + int npages) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool; struct rds_ib_mr *ibmr = NULL; int err = 0, iter = 0; + if (npages <= RDS_FMR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + /* Switch pools if one of the pool is reaching upper limit */ + if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + pool = rds_ibdev->mr_1m_pool; + else + pool = rds_ibdev->mr_8k_pool; + } + while (1) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) @@ -341,12 +360,18 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) atomic_dec(&pool->item_count); if (++iter > 2) { - rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); return 
ERR_PTR(-EAGAIN); } /* We do have some empty MRs. Flush them out. */ - rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; @@ -371,7 +396,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - rds_ib_stats_inc(s_ib_rdma_mr_alloc); + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + return ibmr; out_no_cigar: @@ -427,7 +457,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm } page_cnt += len >> PAGE_SHIFT; - if (page_cnt > fmr_message_size) + if (page_cnt > ibmr->pool->fmr_attr.max_pages) return -EINVAL; dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, @@ -459,7 +489,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm ibmr->sg_dma_len = sg_dma_len; ibmr->remap_count++; - rds_ib_stats_inc(s_ib_rdma_mr_used); + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); ret = 0; out: @@ -591,7 +624,7 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, * to free as many MRs as needed to get back to this limit. 
*/ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; struct llist_node *clean_nodes; @@ -602,11 +635,14 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; int ret = 0; - rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); - while(!mutex_trylock(&pool->flush_lock)) { + while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) { *ibmr_ret = ibmr; @@ -663,8 +699,12 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); - if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { - rds_ib_stats_inc(s_ib_rdma_mr_free); + if (nfreed < free_goal || + ibmr->remap_count >= pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); @@ -756,10 +796,11 @@ void rds_ib_flush_mrs(void) down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + if (rds_ibdev->mr_8k_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); - if (pool) - rds_ib_flush_mr_pool(pool, 0, NULL); + if (rds_ibdev->mr_1m_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } @@ -777,12 +818,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - if (!rds_ibdev->mr_pool) { + if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = 
-ENODEV; goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev); + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); if (IS_ERR(ibmr)) { rds_ib_dev_put(rds_ibdev); return ibmr; diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 8c8b84f..d77e044 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -61,12 +61,18 @@ static const char *const rds_ib_stat_names[] = { "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", - "ib_rdma_mr_alloc", - "ib_rdma_mr_free", - "ib_rdma_mr_used", - "ib_rdma_mr_pool_flush", - "ib_rdma_mr_pool_wait", - "ib_rdma_mr_pool_depleted", + "ib_rdma_mr_8k_alloc", + "ib_rdma_mr_8k_free", + "ib_rdma_mr_8k_used", + "ib_rdma_mr_8k_pool_flush", + "ib_rdma_mr_8k_pool_wait", + "ib_rdma_mr_8k_pool_depleted", + "ib_rdma_mr_1m_alloc", + "ib_rdma_mr_1m_free", + "ib_rdma_mr_1m_used", + "ib_rdma_mr_1m_pool_flush", + "ib_rdma_mr_1m_pool_wait", + "ib_rdma_mr_1m_pool_depleted", "ib_atomic_cswp", "ib_atomic_fadd", }; -- cgit v1.1 From 0a837fe4724713ef701e47d6bfab98a5efaff3eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20N=C3=B8rlund?= Date: Tue, 6 Oct 2015 07:24:47 +0200 Subject: ipv4: Fix compilation errors in fib_rebalance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes net/built-in.o: In function `fib_rebalance': fib_semantics.c:(.text+0x9df14): undefined reference to `__divdi3' and net/built-in.o: In function `fib_rebalance': net/ipv4/fib_semantics.c:572: undefined reference to `__aeabi_ldivmod' Fixes: 0e884c78ee19 ("ipv4: L3 hash-based multipath") Signed-off-by: Peter Nørlund Signed-off-by: David S. 
Miller --- net/ipv4/fib_semantics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0c49d2f..7bd698c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -569,8 +569,8 @@ static void fib_rebalance(struct fib_info *fi) upper_bound = -1; } else { w += nexthop_nh->nh_weight; - upper_bound = DIV_ROUND_CLOSEST(2147483648LL * w, - total) - 1; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; } atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); -- cgit v1.1 From d25b8e74291fec2dbf3fe3df7f20289eeaa9d28f Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 3 Oct 2015 18:09:07 +0100 Subject: net: dsa: better error reporting Add additional error reporting to the generic DSA code, so it's easier to debug when things go wrong. This was useful when initially bringing up 88e6176 on a new board. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Acked-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa.c | 4 ++-- net/dsa/slave.c | 24 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index c59fa5d..aa398bc 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -326,8 +326,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) ret = dsa_slave_create(ds, parent, i, pd->port_names[i]); if (ret < 0) { - netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s)\n", - index, i, pd->port_names[i]); + netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", + index, i, pd->port_names[i], ret); ret = 0; } } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5f65f92..4f607bc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1026,8 +1026,10 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, struct dsa_switch *ds = p->parent; p->phy = ds->slave_mii_bus->phy_map[addr]; - if (!p->phy) + if (!p->phy) { + netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; + } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) @@ -1061,7 +1063,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ ret = of_phy_register_fixed_link(port_dn); if (ret) { - netdev_err(slave_dev, "failed to register fixed PHY\n"); + netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret); return ret; } phy_is_fixed = true; @@ -1072,17 +1074,20 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_flags = ds->drv->get_phy_flags(ds, p->port); if (phy_dn) { - ret = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + /* If this PHY address is part of phys_mii_mask, which means * that we need to divert reads and writes to/from it, then we * want to bind this device using the slave MII bus created by * DSA to make that happen. 
*/ - if (!phy_is_fixed && ret >= 0 && - (ds->phys_mii_mask & (1 << ret))) { - ret = dsa_slave_phy_connect(p, slave_dev, ret); - if (ret) + if (!phy_is_fixed && phy_id >= 0 && + (ds->phys_mii_mask & (1 << phy_id))) { + ret = dsa_slave_phy_connect(p, slave_dev, phy_id); + if (ret) { + netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); return ret; + } } else { p->phy = of_phy_connect(slave_dev, phy_dn, dsa_slave_adjust_link, @@ -1099,8 +1104,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ if (!p->phy) { ret = dsa_slave_phy_connect(p, slave_dev, p->port); - if (ret) + if (ret) { + netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); return ret; + } } else { netdev_info(slave_dev, "attached PHY at address %d [%s]\n", p->phy->addr, p->phy->drv->name); @@ -1212,6 +1219,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { + netdev_err(master, "error %d setting up slave phy\n", ret); free_netdev(slave_dev); return ret; } -- cgit v1.1 From 16660f0bd942cec203eaf4de0e2ac1695bd9d32d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 3 Oct 2015 11:43:46 -0700 Subject: net: Add support for filtering neigh dump by device index Add support for filtering neighbor dumps by device by adding the NDA_IFINDEX attribute to the dump request. Signed-off-by: David Ahern Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8c57fdf..1aa8437 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2249,6 +2249,14 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) return false; } +static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) +{ + if (filter_idx && dev->ifindex != filter_idx) + return true; + + return false; +} + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { @@ -2259,16 +2267,19 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; - int filter_master_idx = 0; + int filter_master_idx = 0, filter_idx = 0; unsigned int flags = NLM_F_MULTI; int err; err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL); if (!err) { + if (tb[NDA_IFINDEX]) + filter_idx = nla_get_u32(tb[NDA_IFINDEX]); + if (tb[NDA_MASTER]) filter_master_idx = nla_get_u32(tb[NDA_MASTER]); - if (filter_master_idx) + if (filter_idx || filter_master_idx) flags |= NLM_F_DUMP_FILTERED; } @@ -2283,6 +2294,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; + if (neigh_ifindex_filtered(n->dev, filter_idx)) + continue; if (neigh_master_filtered(n->dev, filter_master_idx)) continue; if (idx < s_idx) -- cgit v1.1 From 4917a1548ff41e53d863d6845b4da1884e4282b4 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 5 Oct 2015 12:11:21 +0200 Subject: bridge: netlink: make br_fill_info's frame size smaller When KASAN is enabled the frame size grows > 2048 bytes and we get a warning, so make it smaller. 
net/bridge/br_netlink.c: In function 'br_fill_info': >> net/bridge/br_netlink.c:1110:1: warning: the frame size of 2160 bytes >> is larger than 2048 bytes [-Wframe-larger-than=] Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 70efe2e..330abf4 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1057,28 +1057,27 @@ static size_t br_get_size(const struct net_device *brdev) static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) { struct net_bridge *br = netdev_priv(brdev); - u64 hello_timer, tcn_timer, topology_change_timer, gc_timer, clockval; u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; - u16 group_fwd_mask = br->group_fwd_mask; u8 vlan_enabled = br_vlan_enabled(br); - struct ifla_bridge_id root_id, bridge_id; - - memset(&bridge_id, 0, sizeof(bridge_id)); - memset(&root_id, 0, sizeof(root_id)); - memcpy(root_id.prio, br->designated_root.prio, sizeof(root_id.prio)); - memcpy(root_id.addr, br->designated_root.addr, sizeof(root_id.addr)); - memcpy(bridge_id.prio, br->bridge_id.prio, sizeof(bridge_id.prio)); - memcpy(bridge_id.addr, br->bridge_id.addr, sizeof(bridge_id.addr)); - hello_timer = br_timer_value(&br->hello_timer); - tcn_timer = br_timer_value(&br->tcn_timer); - topology_change_timer = br_timer_value(&br->topology_change_timer); - gc_timer = br_timer_value(&br->gc_timer); - clockval = 0; + u64 clockval; + + clockval = br_timer_value(&br->hello_timer); + if (nla_put_u64(skb, IFLA_BR_HELLO_TIMER, clockval)) + return 
-EMSGSIZE; + clockval = br_timer_value(&br->tcn_timer); + if (nla_put_u64(skb, IFLA_BR_TCN_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->topology_change_timer); + if (nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval)) + return -EMSGSIZE; + clockval = br_timer_value(&br->gc_timer); + if (nla_put_u64(skb, IFLA_BR_GC_TIMER, clockval)) + return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || @@ -1087,19 +1086,16 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) || - nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, group_fwd_mask) || - nla_put(skb, IFLA_BR_ROOT_ID, sizeof(root_id), &root_id) || - nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(bridge_id), &bridge_id) || + nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) || + nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id), + &br->bridge_id) || + nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id), + &br->designated_root) || nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) || nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || - nla_put_u64(skb, IFLA_BR_HELLO_TIMER, hello_timer) || - nla_put_u64(skb, IFLA_BR_TCN_TIMER, tcn_timer) || - nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, - topology_change_timer) || - nla_put_u64(skb, IFLA_BR_GC_TIMER, gc_timer) || nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) return -EMSGSIZE; -- cgit v1.1 From 00a93babd06aaad31d23384cda576ede0f586a8c Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 5 Oct 2015 13:09:46 +0200 Subject: openvswitch: add tunnel protocol to sw_flow_key Store tunnel protocol (AF_INET 
or AF_INET6) in sw_flow_key. This field now also acts as an indicator whether the flow contains tunnel data (this was previously indicated by tun_key.u.ipv4.dst being set but with IPv6 addresses in an union with IPv4 ones this won't work anymore). The new field was added to a hole in sw_flow_key. Signed-off-by: Jiri Benc Acked-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 4 ++-- net/openvswitch/flow.h | 1 + net/openvswitch/flow_netlink.c | 10 ++++++++-- net/openvswitch/flow_table.c | 2 +- 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c8db44a..0ea128e 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -698,8 +698,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, { /* Extract metadata from packet. */ if (tun_info) { - if (ip_tunnel_info_af(tun_info) != AF_INET) - return -EINVAL; + key->tun_proto = ip_tunnel_info_af(tun_info); memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); if (tun_info->options_len) { @@ -714,6 +713,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->tun_opts_len = 0; } } else { + key->tun_proto = 0; key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe527d2..5688e33 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -63,6 +63,7 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u8 tun_proto; /* Protocol of encapsulating tunnel. */ u32 ovs_flow_hash; /* Datapath computed hash value. */ u32 recirc_id; /* Recirculation ID. 
*/ struct { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5c030a4..6be701f6 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -643,6 +643,10 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, } SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + if (is_mask) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); + else + SW_FLOW_KEY_PUT(match, tun_proto, AF_INET, false); if (rem > 0) { OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", @@ -1194,7 +1198,7 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. */ - if (match->key->tun_key.u.ipv4.dst) + if (match->key->tun_proto) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1367,7 +1371,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.u.ipv4.dst || is_mask)) { + if ((swkey->tun_proto || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) @@ -1913,6 +1917,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, tun_info = &tun_dst->u.tun_info; tun_info->mode = IP_TUNNEL_INFO_TX; + if (key.tun_proto == AF_INET6) + tun_info->mode |= IP_TUNNEL_INFO_IPV6; tun_info->key = key.tun_key; /* We need to store the options in the action itself since diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index f2ea83b..95dbced 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -427,7 +427,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.u.ipv4.dst) + if (key->tun_proto) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), -- cgit v1.1 From 
6b26ba3a7d952e611dcde1f3f77ce63bcc70540a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 5 Oct 2015 13:09:47 +0200 Subject: openvswitch: netlink attributes for IPv6 tunneling Add netlink attributes for IPv6 tunnel addresses. This enables IPv6 support for tunnels. Signed-off-by: Jiri Benc Acked-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 121 ++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 6be701f6..77850f1 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -262,8 +262,8 @@ size_t ovs_tun_key_attr_size(void) * updating this function. */ return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ @@ -323,6 +323,8 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, .next = ovs_vxlan_ext_key_lens }, + [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. 
*/ @@ -542,14 +544,14 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } -static int ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) +static int ip_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) { struct nlattr *a; int rem; bool ttl = false; - __be16 tun_flags = 0; + __be16 tun_flags = 0, ipv4 = false, ipv6 = false; int opts_type = 0; nla_for_each_nested(a, attr, rem) { @@ -578,10 +580,22 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); + ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; break; case OVS_TUNNEL_KEY_ATTR_TOS: SW_FLOW_KEY_PUT(match, tun_key.tos, @@ -636,7 +650,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, opts_type = type; break; default: - OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", + OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); return -EINVAL; } @@ -646,22 +660,36 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, if (is_mask) SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); else - SW_FLOW_KEY_PUT(match, tun_proto, AF_INET, false); + SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? 
AF_INET6 : AF_INET, + false); if (rem > 0) { - OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", rem); return -EINVAL; } + if (ipv4 && ipv6) { + OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); + return -EINVAL; + } + if (!is_mask) { - if (!match->key->tun_key.u.ipv4.dst) { + if (!ipv4 && !ipv6) { + OVS_NLERR(log, "IP tunnel dst address not specified"); + return -EINVAL; + } + if (ipv4 && !match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } + if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { + OVS_NLERR(log, "IPv6 tunnel dst address is zero"); + return -EINVAL; + } if (!ttl) { - OVS_NLERR(log, "IPv4 tunnel TTL not specified."); + OVS_NLERR(log, "IP tunnel TTL not specified."); return -EINVAL; } } @@ -686,21 +714,36 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, return 0; } -static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ip_tunnel_key *output, - const void *tun_opts, int swkey_tun_opts_len) +static int __ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->u.ipv4.src && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->u.ipv4.src)) - return -EMSGSIZE; - if (output->u.ipv4.dst && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->u.ipv4.dst)) - return -EMSGSIZE; + switch (tun_proto) { + case AF_INET: + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) + return -EMSGSIZE; + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&output->u.ipv6.src) && + nla_put_in6_addr(skb, 
OVS_TUNNEL_KEY_ATTR_IPV6_SRC, + &output->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&output->u.ipv6.dst) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, + &output->u.ipv6.dst)) + return -EMSGSIZE; + break; + } if (output->tos && nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; @@ -734,9 +777,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ip_tunnel_key *output, - const void *tun_opts, int swkey_tun_opts_len) +static int ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { struct nlattr *nla; int err; @@ -745,7 +789,8 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, + tun_proto); if (err) return err; @@ -757,9 +802,10 @@ int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, const struct ip_tunnel_info *egress_tun_info, const void *egress_tun_opts) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, - egress_tun_opts, - egress_tun_info->options_len); + return __ip_tun_to_nlattr(skb, &egress_tun_info->key, + egress_tun_opts, + egress_tun_info->options_len, + ip_tunnel_info_af(egress_tun_info)); } static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, @@ -810,8 +856,8 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask, log) < 0) + if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } @@ -1377,8 +1423,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if 
(output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); - if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len)) + if (ip_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len, swkey->tun_proto)) goto nla_put_failure; } @@ -1881,7 +1927,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, int err = 0, start, opts_type; ovs_match_init(&match, &key, NULL); - opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); + opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; @@ -2380,10 +2426,11 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->key, - tun_info->options_len ? + err = ip_tun_to_nlattr(skb, &tun_info->key, + tun_info->options_len ? ip_tunnel_info_opts(tun_info) : NULL, - tun_info->options_len); + tun_info->options_len, + ip_tunnel_info_af(tun_info)); if (err) return err; nla_nest_end(skb, start); -- cgit v1.1 From 4148987a5111b0c8062bd78f39a67c361f621a39 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 08:32:51 -0600 Subject: net: Fix vti use case with oif in dst lookups for IPv6 It occurred to me yesterday that 741a11d9e4103 ("net: ipv6: Add RT6_LOOKUP_F_IFACE flag if oif is set") means that xfrm6_dst_lookup needs the FLOWI_FLAG_SKIP_NH_OIF flag set. This latest commit causes the oif to be considered in lookups which is known to break vti. This explains why 58189ca7b274 did not make the IPv6 change at the time it was submitted. Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/xfrm6_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 69cee4e..08c9c93 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -37,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; + fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); -- cgit v1.1 From 6e2895a8e3824eb5611c97a015a3b6d678b4503e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 08:51:23 -0700 Subject: net: Rename FLOWI_FLAG_VRFSRC to FLOWI_FLAG_L3MDEV_SRC Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 156ba75..b2882cf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (netif_index_is_l3_master(net, ipc.oif)) { flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC | + (flow_flags | FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF), faddr, saddr, dport, inet->inet_sport); -- cgit v1.1 From fee6d4c777a125e56de9370db3b2bf359bf958d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 08:51:24 -0700 Subject: net: Add netif_is_l3_slave IPv6 addrconf keys off of IFF_SLAVE so can not use it for L3 slave. Add a new private flag and add netif_is_l3_slave function for checking it. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/l3mdev/l3mdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index ddf75ad..8e5ead3 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -26,11 +26,11 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev) if (netif_is_l3_master(dev)) { ifindex = dev->ifindex; - } else if (dev->flags & IFF_SLAVE) { + } else if (netif_is_l3_slave(dev)) { struct net_device *master; master = netdev_master_upper_dev_get_rcu(dev); - if (master && netif_is_l3_master(master)) + if (master) ifindex = master->ifindex; } @@ -54,7 +54,7 @@ u32 l3mdev_fib_table_rcu(const struct net_device *dev) if (netif_is_l3_master(dev)) { if (dev->l3mdev_ops->l3mdev_fib_table) tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev); - } else if (dev->flags & IFF_SLAVE) { + } else if (netif_is_l3_slave(dev)) { /* Users of netdev_master_upper_dev_get_rcu need non-const, * but current inet_*type functions take a const */ @@ -62,7 +62,7 @@ u32 l3mdev_fib_table_rcu(const struct net_device *dev) const struct net_device *master; master = netdev_master_upper_dev_get_rcu(_dev); - if (master && netif_is_l3_master(master) && + if (master && master->l3mdev_ops->l3mdev_fib_table) tb_id = master->l3mdev_ops->l3mdev_fib_table(master); } -- cgit v1.1 From 3ce58d84358c7b477811b5100152fad848f936fc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 08:51:25 -0700 Subject: net: Refactor path selection in __ip_route_output_key_hash VRF device needs the same path selection following lookup to set source address. Rather than duplicating code, move existing code into a function that is exported to modules. Code move only; no functional change. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/fib_semantics.c | 21 +++++++++++++++++++++ net/ipv4/route.c | 16 +--------------- 2 files changed, 22 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7bd698c..af77298 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1557,3 +1557,24 @@ void fib_select_multipath(struct fib_result *res, int hash) res->nh_sel = 0; } #endif + +void fib_select_path(struct net *net, struct fib_result *res, + struct flowi4 *fl4, int mp_hash) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (mp_hash < 0) + mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr); + fib_select_multipath(res, mp_hash); + } + else +#endif + if (!res->prefixlen && + res->table->tb_num_default > 1 && + res->type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(fl4, res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, *res); +} +EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 54297d3..54e6f45 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2238,21 +2238,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, goto make_route; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { - if (mp_hash < 0) - mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr); - fib_select_multipath(&res, mp_hash); - } - else -#endif - if (!res.prefixlen && - res.table->tb_num_default > 1 && - res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(fl4, &res); - - if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, res); + fib_select_path(net, &res, fl4, mp_hash); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; -- cgit v1.1 From 8cbb512c923d5f695ff6265b2b741b1718e3b444 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 08:51:26 -0700 Subject: net: Add source address lookup op for VRF Add operation to l3mdev to 
lookup source address for a given flow. Add support for the operation to VRF driver and convert existing IPv4 hooks to use the new lookup. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b2882cf..e1fc129 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1017,30 +1017,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - /* unconnected socket. If output device is enslaved to a VRF - * device lookup source address from VRF table. This mimics - * behavior of ip_route_connect{_init}. - */ - if (netif_index_is_l3_master(net, ipc.oif)) { - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, - RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF), - faddr, saddr, dport, - inet->inet_sport); - - rt = ip_route_output_flow(net, fl4, sk); - if (!IS_ERR(rt)) { - saddr = fl4->saddr; - ip_rt_put(rt); - } - } - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, fl4); + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { -- cgit v1.1 From bb191c3e874650ae8f701885f3dd5f8ea8989b19 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 08:51:27 -0700 Subject: net: Add l3mdev saddr lookup to raw_sendmsg ping originated on box through a VRF device is showing up in tcpdump without a source address: $ tcpdump -n -i vrf-blue 08:58:33.311303 IP 0.0.0.0 > 10.2.2.254: ICMP echo request, id 2834, seq 1, length 64 08:58:33.311562 IP 10.2.2.254 > 10.2.2.2: ICMP echo reply, id 2834, seq 1, length 64 Add the call to l3mdev_get_saddr to raw_sendmsg. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/raw.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 28ef8a9..09a07e8 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -484,6 +484,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; @@ -543,7 +544,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(net, msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -598,6 +599,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, &fl4); + if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -608,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; -- cgit v1.1 From deaa0a6a930edc79081268bf23b196d0340499af Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 5 Oct 2015 10:49:04 -0700 Subject: net: Lookup actual route when oif is VRF device If the user specifies a VRF device in a get route query the custom route pointing to the VRF device is returned: $ ip route ls table vrf-red unreachable default broadcast 10.2.1.0 dev eth1 proto kernel scope link src 10.2.1.2 10.2.1.0/24 dev eth1 proto kernel scope link src 10.2.1.2 local 10.2.1.2 dev eth1 proto kernel scope host src 10.2.1.2 broadcast 10.2.1.255 dev eth1 proto kernel scope link src 10.2.1.2 $ ip route 
get oif vrf-red 10.2.1.40 10.2.1.40 dev vrf-red cache Add the flags to skip the custom route and go directly to the FIB. With this patch the actual route is returned: $ ip route get oif vrf-red 10.2.1.40 10.2.1.40 dev eth1 src 10.2.1.2 cache Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 54e6f45..bf1486b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2507,6 +2507,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + if (netif_index_is_l3_master(net, fl4.flowi4_oif)) + fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; + if (iif) { struct net_device *dev; -- cgit v1.1 From 4ebc7660ab4559cad10b6595e05f70562bb26dc5 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:11:55 +0200 Subject: bridge: netlink: export port's root id Add IFLA_BRPORT_ROOT_ID to allow getting the designated root id via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 330abf4..cad4050 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -127,6 +127,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + 0; } @@ -160,7 +161,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, - !!(p->flags & BR_PROXYARP_WIFI))) + !!(p->flags & BR_PROXYARP_WIFI)) || + nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), + &p->designated_root)) return -EMSGSIZE; return 0; -- cgit v1.1 From 80df9a2692edf7afffda9282e716e7b1df198e07 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:11:56 +0200 Subject: bridge: netlink: export port's bridge id Add IFLA_BRPORT_BRIDGE_ID to allow getting the designated bridge id via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index cad4050..c3e0b73 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -128,6 +128,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + 0; } @@ -163,7 +164,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), - &p->designated_root)) + &p->designated_root) || + nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), + &p->designated_bridge)) return -EMSGSIZE; return 0; -- cgit v1.1 From 96f94e7f4a216282a24819968184c881e6343692 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:11:57 +0200 Subject: bridge: netlink: export port's designated cost and port Add IFLA_BRPORT_DESIGNATED_(COST|PORT) to allow getting the port's designated cost and port respectively via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c3e0b73..678d227 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -129,6 +129,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + 0; } @@ -166,7 +168,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id), &p->designated_root) || nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), - &p->designated_bridge)) + &p->designated_bridge) || + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost)) return -EMSGSIZE; return 0; -- cgit v1.1 From 42d452c4b5e7bf0e3024fa9512ec462f70545ae5 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:11:58 +0200 Subject: bridge: netlink: export port's id and number Add IFLA_BRPORT_(ID|NO) to allow getting port's port_id and port_no respectively via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 678d227..e513327 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -131,6 +131,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + 0; } @@ -170,7 +172,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id), &p->designated_bridge) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || - nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost)) + nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || + nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || + nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no)) return -EMSGSIZE; return 0; -- cgit v1.1 From e08e838ac5707cb1f1294e0d53b31997a0367b99 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:11:59 +0200 Subject: bridge: netlink: export port's topology_change_ack and config_pending Add IFLA_BRPORT_TOPOLOGY_CHANGE_ACK and IFLA_BRPORT_CONFIG_PENDING to allow getting port's topology_change_ack and config_pending respectively via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e513327..433d632 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -133,6 +133,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_COST */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + 0; } @@ -174,7 +176,10 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) || nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) || nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) || - nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no)) + nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || + nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + p->topology_change_ack) || + nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) return -EMSGSIZE; return 0; -- cgit v1.1 From 61c0a9a83e0b12c712cd686172446aba8ea48685 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:12:00 +0200 Subject: bridge: netlink: export port's timer values Add the following attributes in order to export port's timer values: IFLA_BRPORT_MESSAGE_AGE_TIMER, IFLA_BRPORT_FORWARD_DELAY_TIMER and IFLA_BRPORT_HOLD_TIMER. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 433d632..04b0e50 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -135,6 +135,9 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ + 0; } @@ -156,6 +159,7 @@ static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & BR_HAIRPIN_MODE); + u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) || @@ -182,6 +186,16 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) return -EMSGSIZE; + timerval = br_timer_value(&p->message_age_timer); + if (nla_put_u64(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval)) + return -EMSGSIZE; + timerval = br_timer_value(&p->forward_delay_timer); + if (nla_put_u64(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval)) + return -EMSGSIZE; + timerval = br_timer_value(&p->hold_timer); + if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval)) + return -EMSGSIZE; + return 0; } -- cgit v1.1 From 9b0c6e4deb3df91bf0aea8158ea77dc58c9d90b6 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:12:01 +0200 Subject: bridge: netlink: allow to flush port's fdb Add IFLA_BRPORT_FLUSH to allow flushing port's fdb similar to sysfs's flush. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 04b0e50..6468166 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -631,6 +631,9 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } + if (tb[IFLA_BRPORT_FLUSH]) + br_fdb_delete_by_port(p->br, p, 0, 0); + br_port_flags_change(p, old_flags ^ p->flags); return 0; } -- cgit v1.1 From 5d6ae479ab7ddf77bb22bdf739268581453ff886 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 6 Oct 2015 14:12:02 +0200 Subject: bridge: netlink: add support for port's multicast_router attribute Add IFLA_BRPORT_MULTICAST_ROUTER to allow setting/getting port's multicast_router via netlink. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6468166..d78b442 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -138,6 +138,9 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ +#endif + 0; } @@ -196,6 +199,12 @@ static int br_port_fill_attrs(struct sk_buff *skb, if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval)) return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, + p->multicast_router)) + return -EMSGSIZE; +#endif + return 0; } @@ -560,6 +569,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, 
[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, + [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -634,6 +644,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (tb[IFLA_BRPORT_FLUSH]) br_fdb_delete_by_port(p->br, p, 0, 0); +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { + u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); + + err = br_multicast_set_port_router(p, mcast_router); + if (err) + return err; + } +#endif br_port_flags_change(p, old_flags ^ p->flags); return 0; } -- cgit v1.1 From 686a562449af96a0e8c18c6f1b87b47ff8c36de8 Mon Sep 17 00:00:00 2001 From: Yuvaraja Mariappan Date: Tue, 6 Oct 2015 10:53:29 -0700 Subject: net: ipv4: tcp.c Fixed an assignment coding style issue Fixed an assignment coding style issue Signed-off-by: Yuvaraja Mariappan Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3c96fa8..ac1bdbb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -900,7 +900,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto out_err; } @@ -967,7 +968,8 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) { + size -= copy; + if (!size) { tcp_tx_timestamp(sk, skb); goto out; } @@ -988,7 +990,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1111,7 +1114,8 @@ int tcp_sendmsg(struct sock *sk, 
struct msghdr *msg, size_t size) */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } @@ -1267,7 +1271,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1767,7 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* __ Restore normal policy in scheduler __ */ - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; @@ -1778,7 +1784,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, do_prequeue: tcp_prequeue_process(sk); - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; @@ -2230,7 +2237,8 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; -- cgit v1.1 From acb4a6bfc80ddeea4c44074dd630f916259e909e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Oct 2015 14:49:58 -0700 Subject: tcp: ensure prior synack rtx behavior with small backlogs Some applications use a listen() backlog of 1. Prior kernels were silently enforcing a qlen_log of 4, so that we were sending up to /proc/sys/net/ipv4/tcp_synack_retries SYNACK messages. 
Fixes: ef547f2ac16b ("tcp: remove max_qlen_log") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 89eedfb..514b9e9 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -579,7 +579,7 @@ static void reqsk_timer_handler(unsigned long data) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if ((qlen << 1) > sk_listener->sk_max_ack_backlog) { + if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { -- cgit v1.1 From 6c566dd5a1253f73458ce6ba6cf3830e9d38c132 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 7 Oct 2015 15:32:13 +0200 Subject: Bluetooth: Send index information updates to monitor channel The Bluetooth public device address might change during controller setup and it makes it a lot simpler for monitoring tools if they just get told what the new address is. In addition include the manufacturer / company information of the controller. That allows for easy vendor specific HCI command and event handling. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 64ebe84..9bf30db 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -303,6 +303,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; struct hci_mon_new_index *ni; + struct hci_mon_index_info *ii; struct sk_buff *skb; __le16 opcode; @@ -312,7 +313,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) if (!skb) return NULL; - ni = (void *) skb_put(skb, HCI_MON_NEW_INDEX_SIZE); + ni = (void *)skb_put(skb, HCI_MON_NEW_INDEX_SIZE); ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); @@ -329,6 +330,18 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; + case HCI_DEV_UP: + skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); + if (!skb) + return NULL; + + ii = (void *)skb_put(skb, HCI_MON_INDEX_INFO_SIZE); + bacpy(&ii->bdaddr, &hdev->bdaddr); + ii->manufacturer = cpu_to_le16(hdev->manufacturer); + + opcode = cpu_to_le16(HCI_MON_INDEX_INFO); + break; + case HCI_DEV_OPEN: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) @@ -384,6 +397,16 @@ static void send_monitor_replay(struct sock *sk) if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); + + if (!test_bit(HCI_UP, &hdev->flags)) + continue; + + skb = create_monitor_event(hdev, HCI_DEV_UP); + if (!skb) + continue; + + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); } read_unlock(&hci_dev_list_lock); -- cgit v1.1 From e875ff84079b9e7d3ce24b97e3396230d41044d4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 7 Oct 2015 16:38:35 +0200 Subject: Bluetooth: Add support for vendor specific diagnostic channel Introduce hci_recv_diag function for HCI drivers to 
allow sending vendor specific diagnostic messages into the Bluetooth core stack. The messages are not processed, but they are forwarded to the monitor channel and can be retrieved by user space diagnostic tools. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 15 +++++++++++++++ net/bluetooth/hci_sock.c | 3 +++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 40a6701..8193845 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3493,6 +3493,21 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_frame); +/* Receive diagnostic message from HCI drivers */ +int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) +{ + /* Time stamp */ + __net_timestamp(skb); + + /* Mark as diagnostic packet and send to monitor */ + bt_cb(skb)->pkt_type = HCI_DIAG_PKT; + hci_send_to_monitor(hdev, skb); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(hci_recv_diag); + /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 9bf30db..9a100c1 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -279,6 +279,9 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; + case HCI_DIAG_PKT: + opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); + break; default: return; } -- cgit v1.1 From 4b4113d6dbdbdac095743c05f694af9b7cdc9a44 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 7 Oct 2015 19:52:35 +0200 Subject: Bluetooth: Add debugfs entry for setting vendor diagnostic mode This adds a new debugfs entry for enabling and disabling the vendor diagnostic mode. It is only exposed for drivers that provide the set_diag driver callback and actually have an option for vendor specific diagnostic information. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8193845..e75bc54 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -134,6 +134,56 @@ static const struct file_operations dut_mode_fops = { .llseek = default_llseek, }; +static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y': 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf)-1)); + bool enable; + int err; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + hci_req_lock(hdev); + err = hdev->set_diag(hdev, enable); + hci_req_unlock(hdev); + + if (err < 0) + return err; + + if (enable) + hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); + else + hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); + + return count; +} + +static const struct file_operations vendor_diag_fops = { + .open = simple_open, + .read = vendor_diag_read, + .write = vendor_diag_write, + .llseek = default_llseek, +}; + /* ---- HCI requests ---- */ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, @@ -850,12 +900,19 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - /* The Device Under Test (DUT) mode is special and available for - * all controller types. So just create it early on. 
- */ if (hci_dev_test_flag(hdev, HCI_SETUP)) { + /* The Device Under Test (DUT) mode is special and available + * for all controller types. So just create it early on. + */ debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, &dut_mode_fops); + + /* When the driver supports the set_diag callback, then + * expose an entry to modify the vendor diagnostic setting. + */ + if (hdev->set_diag) + debugfs_create_file("vendor_diag", 0644, hdev->debugfs, + hdev, &vendor_diag_fops); } err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); -- cgit v1.1 From acc649c6540ef224cc07d17c4b632da9dedfb6a2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 8 Oct 2015 01:53:55 +0200 Subject: Bluetooth: Fix interaction of HCI_QUIRK_RESET_ON_CLOSE and HCI_AUTO_OFF When the controller requires the HCI Reset command to be sent when closing the transport, the HCI_AUTO_OFF needs to be accounted for. The current code tries to actually do that, but the flag gets cleared too early. So store its value and use that stored value instead of checking for a flag that is always cleared. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e75bc54..43a1f2d 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1614,6 +1614,8 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) int hci_dev_do_close(struct hci_dev *hdev) { + bool auto_off; + BT_DBG("%s %p", hdev->name, hdev); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && @@ -1669,10 +1671,10 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - if (!hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { - if (hdev->dev_type == HCI_BREDR) - mgmt_powered(hdev, 0); - } + auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); + + if (!auto_off && hdev->dev_type == HCI_BREDR) + mgmt_powered(hdev, 0); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); @@ -1689,9 +1691,8 @@ int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && - !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && - test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && + !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); clear_bit(HCI_INIT, &hdev->flags); -- cgit v1.1 From fe806dceded462f7930f8ac4a41c5d19819e70b7 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 8 Oct 2015 03:14:28 +0200 Subject: Bluetooth: Enforce packet types in hci_recv_frame driver function When calling the hci_recv_frame driver function check for valid packet types that the core should process. This should catch issues with drivers trying to feed vendor packet types through this interface. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 43a1f2d..b2095ca 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3538,6 +3538,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) return -ENXIO; } + if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT && + bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && + bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + kfree_skb(skb); + return -EINVAL; + } + /* Incoming skb */ bt_cb(skb)->incoming = 1; -- cgit v1.1 From 301de2cb6a521405cde1a2f9cdc42c5257b5725b Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 6 Oct 2015 13:03:19 +0300 Subject: Bluetooth: 6lowpan: Fix imtu & omtu values The omtu value is determined by the remote peer so there's no point in trying to hard-code it to any value. The IPSP specification otoh gives a more reasonable value for the imtu, i.e. 1280. Signed-off-by: Johan Hedberg Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 131e79c..3e20f7a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -775,8 +775,7 @@ static struct l2cap_chan *chan_create(void) chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; - chan->omtu = 65535; - chan->imtu = chan->omtu; + chan->imtu = 1280; return chan; } -- cgit v1.1 From 5d0fd77a043504dabccb66d9b5671e682868e96d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 6 Oct 2015 13:03:20 +0300 Subject: Bluetooth: 6lowpan: Remove redundant (and incorrect) MPS assignments The L2CAP core code already sets the local MPS to a sane value. The remote MPS value otoh comes from the remote side so there's no point in trying to hard-code it to any value. 
Signed-off-by: Johan Hedberg Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 3e20f7a..3d951ab 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -788,9 +788,6 @@ static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) if (!chan) return NULL; - chan->remote_mps = chan->omtu; - chan->mps = chan->omtu; - chan->state = BT_CONNECTED; return chan; -- cgit v1.1 From b0c09f94ff1660a1873549b788c998284ea5fb8a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 6 Oct 2015 13:03:21 +0300 Subject: Bluetooth: 6lowpan: Remove redundant BT_CONNECTED assignment The L2CAP core code makes sure of setting the channel state to BT_CONNECTED, so there's no need for the implementation code (6lowpan in this case) to do it. Signed-off-by: Johan Hedberg Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 3d951ab..023fa29 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -788,8 +788,6 @@ static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) if (!chan) return NULL; - chan->state = BT_CONNECTED; - return chan; } -- cgit v1.1 From 630ef791ea8e4274f20b833e1977cb1b0462d3ec Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 6 Oct 2015 13:03:22 +0300 Subject: Bluetooth: 6lowpan: Remove unnecessary chan_open() function All the chan_open() function now does is to call chan_create() so it doesn't really add any value. 
Signed-off-by: Johan Hedberg Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 023fa29..77eb698 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -780,17 +780,6 @@ static struct l2cap_chan *chan_create(void) return chan; } -static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) -{ - struct l2cap_chan *chan; - - chan = chan_create(); - if (!chan) - return NULL; - - return chan; -} - static void set_ip_addr_bits(u8 addr_type, u8 *addr) { if (addr_type == BDADDR_LE_PUBLIC) @@ -913,7 +902,10 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; - chan = chan_open(pchan); + chan = chan_create(); + if (!chan) + return NULL; + chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); -- cgit v1.1 From 0cd088fc97bbe4834e9bc9727012ecac49386849 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 6 Oct 2015 13:03:23 +0300 Subject: Bluetooth: 6lowpan: Rename confusing 'pchan' variables The typical convention when having both a child and a parent channel variable is to call the former 'chan' and the latter 'pchan'. When there's only one variable it's called chan. Rename the 'pchan' variables in the 6lowpan code to follow this convention. 
Signed-off-by: Johan Hedberg Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 77eb698..e20b972 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -1053,32 +1053,32 @@ static inline __u8 bdaddr_type(__u8 type) static struct l2cap_chan *chan_get(void) { - struct l2cap_chan *pchan; + struct l2cap_chan *chan; - pchan = chan_create(); - if (!pchan) + chan = chan_create(); + if (!chan) return NULL; - pchan->ops = &bt_6lowpan_chan_ops; + chan->ops = &bt_6lowpan_chan_ops; - return pchan; + return chan; } static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { - struct l2cap_chan *pchan; + struct l2cap_chan *chan; int err; - pchan = chan_get(); - if (!pchan) + chan = chan_get(); + if (!chan) return -EINVAL; - err = l2cap_chan_connect(pchan, cpu_to_le16(L2CAP_PSM_IPSP), 0, + err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type); - BT_DBG("chan %p err %d", pchan, err); + BT_DBG("chan %p err %d", chan, err); if (err < 0) - l2cap_chan_put(pchan); + l2cap_chan_put(chan); return err; } @@ -1103,31 +1103,31 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; - struct l2cap_chan *pchan; + struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; - pchan = chan_get(); - if (!pchan) + chan = chan_get(); + if (!chan) return NULL; - pchan->state = BT_LISTEN; - pchan->src_type = BDADDR_LE_PUBLIC; + chan->state = BT_LISTEN; + chan->src_type = BDADDR_LE_PUBLIC; - atomic_set(&pchan->nesting, L2CAP_NESTING_PARENT); + atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); - BT_DBG("chan %p src type %d", pchan, pchan->src_type); + BT_DBG("chan %p src type %d", chan, chan->src_type); - err = l2cap_add_psm(pchan, addr, 
cpu_to_le16(L2CAP_PSM_IPSP)); + err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { - l2cap_chan_put(pchan); + l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } - return pchan; + return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, -- cgit v1.1 From 26d46dffbe2cd0a023aa6192708f80cd796af107 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 6 Oct 2015 13:03:24 +0300 Subject: Bluetooth: 6lowpan: Remove unnecessary chan_get() function The chan_get() function just adds unnecessary indirection to calling the chan_create() call. The only added value it gives is the chan->ops assignment, but that can equally well be done in the calling code. Signed-off-by: Johan Hedberg Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index e20b972..9363f05 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -1051,28 +1051,17 @@ static inline __u8 bdaddr_type(__u8 type) return BDADDR_LE_RANDOM; } -static struct l2cap_chan *chan_get(void) -{ - struct l2cap_chan *chan; - - chan = chan_create(); - if (!chan) - return NULL; - - chan->ops = &bt_6lowpan_chan_ops; - - return chan; -} - static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; int err; - chan = chan_get(); + chan = chan_create(); if (!chan) return -EINVAL; + chan->ops = &bt_6lowpan_chan_ops; + err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type); @@ -1109,10 +1098,11 @@ static struct l2cap_chan *bt_6lowpan_listen(void) if (!enable_6lowpan) return NULL; - chan = chan_get(); + chan = chan_create(); if (!chan) return NULL; + chan->ops = &bt_6lowpan_chan_ops; chan->state = BT_LISTEN; chan->src_type = BDADDR_LE_PUBLIC; -- cgit v1.1 From fd2874b3bbe832e90ac480971a7a8bd736b629b9 Mon Sep 17 
00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:32 -0500 Subject: ipv4: Fix ip_local_out_sk by passing the sk into __ip_local_out_sk In the rare case where sk != skb->sk ip_local_out_sk arranges to call dst->output differently if the skb is queued or not. This is a bug. Fix this bug by passing the sk parameter of ip_local_out_sk through from ip_local_out_sk to __ip_local_out_sk (skipping __ip_local_out). Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().") Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 911ea73..6cb585a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -117,7 +117,7 @@ int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; - err = __ip_local_out(skb); + err = __ip_local_out_sk(sk, skb); if (likely(err == 1)) err = dst_output(sk, skb); -- cgit v1.1 From 850dcc4d4dd7d5da5c1b2a780c5e649c3b649545 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:33 -0500 Subject: ipv4: Fix ip_queue_xmit to pass sk into ip_local_out_sk After a packet has been encapsulated by a tunnel we should use the tunnel socket's local multicast loopback flag to control if the encapsulated packet should be locally looped back. Pass sk into ip_local_out_sk so that in the rare case we are dealing with a tunneled packet whose tunnel destination address is a multicast address the kernel properly decides to loop back this packet. In practice I don't think this matters as ip_queue_xmit is used by tcp, l2tp and sctp none of which I am aware of uses ip level multicasting as they are all point to point communications protocols. Let's fix this before someone uses ip_queue_xmit for a tunnel protocol that does use multicast. 
Fixes: aad88724c9d5 ("ipv4: add a sock pointer to dst->output() path.") Fixes: b0270e91014d ("ipv4: add a sock pointer to ip_queue_xmit()") Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6cb585a..1030f48 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -460,7 +460,7 @@ packet_routed: skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(skb); + res = ip_local_out_sk(sk, skb); rcu_read_unlock(); return res; -- cgit v1.1 From 3f5312ae620c79e877a6aa0e8894c6603a090b4e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:34 -0500 Subject: xfrm: Only compute net once in xfrm_policy_queue_process Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 418daa0..be1776b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1887,6 +1887,7 @@ static void xfrm_policy_queue_process(unsigned long arg) struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = (struct xfrm_policy *)arg; + struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; @@ -1903,8 +1904,7 @@ static void xfrm_policy_queue_process(unsigned long arg) spin_unlock(&pq->hold_queue.lock); dst_hold(dst->path); - dst = xfrm_lookup(xp_net(pol), dst->path, &fl, - sk, 0); + dst = xfrm_lookup(net, dst->path, &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue; @@ -1934,8 +1934,7 @@ static void xfrm_policy_queue_process(unsigned long arg) xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path, - &fl, skb->sk, 0); + dst = xfrm_lookup(net, 
skb_dst(skb)->path, &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; -- cgit v1.1 From 13206b6bff3b15b724926a222406476bf2c23c40 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:35 -0500 Subject: net: Pass net into dst_output and remove dst_output_okfn Replace dst_output_okfn with dst_output Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/decnet/dn_nsp_out.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 7 ++++--- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/xfrm4_output.c | 2 +- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/ip6_vti.c | 2 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/mcast.c | 4 ++-- net/ipv6/ndisc.c | 2 +- net/ipv6/output_core.c | 5 +++-- net/ipv6/raw.c | 2 +- net/ipv6/xfrm6_output.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 18 files changed, 27 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 4b02dd3..849805e 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -85,7 +85,7 @@ static void dn_nsp_send(struct sk_buff *skb) if (dst) { try_again: skb_dst_set(skb, dst); - dst_output(skb->sk, skb); + dst_output(&init_net, skb->sk, skb); return; } @@ -582,7 +582,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, * associations. 
*/ skb_dst_set(skb, dst_clone(dst)); - dst_output(skb->sk, skb); + dst_output(&init_net, skb->sk, skb); } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index d66cfb3..da0d7ce 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -72,7 +72,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s ip_forward_options(skb); skb_sender_cpu_clear(skb); - return dst_output(sk, skb); + return dst_output(net, sk, skb); } int ip_forward(struct sk_buff *skb) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1030f48..c94efb2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -105,7 +105,7 @@ static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) ip_send_check(iph); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, - dst_output_okfn); + dst_output); } int __ip_local_out(struct sk_buff *skb) @@ -115,11 +115,12 @@ int __ip_local_out(struct sk_buff *skb) int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { + struct net *net = dev_net(skb_dst(skb)->dev); int err; err = __ip_local_out_sk(sk, skb); if (likely(err == 1)) - err = dst_output(sk, skb); + err = dst_output(net, sk, skb); return err; } @@ -276,7 +277,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(sk, skb); + return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(skb); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 3b87ec5..4d8f0b6 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; - err = dst_output(skb->sk, skb); + err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = skb->len; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index cfcb996..fc42525 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1689,7 +1689,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output(sk, skb); + return dst_output(net, sk, skb); } /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 09a07e8..8c0d0bd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -413,7 +413,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, - dst_output_okfn); + dst_output); if (err > 0) err = net_xmit_errno(err); if (err) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index cd6be73..17db61f 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -87,7 +87,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(sk, skb); + return dst_output(net, sk, skb); } #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index caf7d14..0171e76 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -233,7 +233,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, */ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, (struct sock *)sk, skb, NULL, dst->dev, - dst_output_okfn); + dst_output); } skb->dev = dst->dev; @@ -333,7 +333,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_sender_cpu_clear(skb); - return dst_output(sk, skb); + return dst_output(net, sk, skb); } static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index f96f1c1..0a8610b 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -482,7 +482,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) 
return -EMSGSIZE; } - err = dst_output(skb->sk, skb); + err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 5e5d16e..ad19136 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1991,7 +1991,7 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct IPSTATS_MIB_OUTFORWDATAGRAMS); IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTOCTETS, skb->len); - return dst_output(sk, skb); + return dst_output(net, sk, skb); } /* diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8bf57c..124338a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1646,7 +1646,7 @@ static void mld_sendpack(struct sk_buff *skb) err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, - dst_output_okfn); + dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); @@ -2010,7 +2010,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) skb_dst_set(skb, dst); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb->dev, - dst_output_okfn); + dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7089c30..b18012f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -465,7 +465,7 @@ static void ndisc_send_skb(struct sk_buff *skb, err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, - dst_output_okfn); + dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index e77102c..4337147 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -151,7 +151,7 @@ static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, 
skb_dst(skb)->dev, - dst_output_okfn); + dst_output); } int __ip6_local_out(struct sk_buff *skb) @@ -162,11 +162,12 @@ EXPORT_SYMBOL_GPL(__ip6_local_out); int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) { + struct net *net = dev_net(skb_dst(skb)->dev); int err; err = __ip6_local_out_sk(sk, skb); if (likely(err == 1)) - err = dst_output(sk, skb); + err = dst_output(net, sk, skb); return err; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fec0151..dc65ec1 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -655,7 +655,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, - NULL, rt->dst.dev, dst_output_okfn); + NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 4cefda0..c9a5bd5 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -147,7 +147,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_NETFILTER if (!x) { IP6CB(skb)->flags |= IP6SKB_REROUTED; - return dst_output(sk, skb); + return dst_output(net, sk, skb); } #endif diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 77182b9..504d1fc 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -576,7 +576,7 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!skb->sk) skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_okfn); + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; @@ -598,7 +598,7 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!skb->sk) skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_okfn); + NULL, 
skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; return ret; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index c48a4b8..88752b0 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -141,7 +141,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) goto out; if (!skb_dst(skb)->xfrm) - return dst_output(skb->sk, skb); + return dst_output(net, skb->sk, skb); err = nf_hook(skb_dst(skb)->ops->family, NF_INET_POST_ROUTING, net, skb->sk, skb, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index be1776b..f4f2d98 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1944,7 +1944,7 @@ static void xfrm_policy_queue_process(unsigned long arg) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(skb->sk, skb); + dst_output(net, skb->sk, skb); } out: -- cgit v1.1 From 4ebdfba73c09d8568d891bae87c40fad43dd7f41 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:36 -0500 Subject: dst: Pass a sk into .local_out For consistency with the other similar methods in the kernel pass a struct sock into the dst_ops .local_out method. Simplifying the socket passing case is needed as a prequel to passing a struct net reference into .local_out. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S.
Miller --- net/ipv4/ip_output.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/output_core.c | 2 +- net/ipv6/route.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- net/xfrm/xfrm_output.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c94efb2..c38dfd7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -96,7 +96,7 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); struct iphdr *iph = ip_hdr(skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bf1486b..638b976 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -165,7 +165,7 @@ static struct dst_ops ipv4_dst_ops = { .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, - .local_out = __ip_local_out, + .local_out = __ip_local_out_sk, .neigh_lookup = ipv4_neigh_lookup, }; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index f2606b9..d46d99f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -243,7 +243,7 @@ static struct dst_ops xfrm4_dst_ops = { .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, - .local_out = __ip_local_out, + .local_out = __ip_local_out_sk, .gc_thresh = 32768, }; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 4337147..e5affb5 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -138,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst) EXPORT_SYMBOL(ip6_dst_hoplimit); #endif -static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); int len; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 
d3d9467..b62a507 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -226,7 +226,7 @@ static struct dst_ops ip6_dst_ops_template = { .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, - .local_out = __ip6_local_out, + .local_out = __ip6_local_out_sk, .neigh_lookup = ip6_neigh_lookup, }; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 08c9c93..f787683 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -285,7 +285,7 @@ static struct dst_ops xfrm6_dst_ops = { .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, - .local_out = __ip6_local_out, + .local_out = __ip6_local_out_sk, .gc_thresh = 32768, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 88752b0..a7a254f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -136,7 +136,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset(skb); - err = skb_dst(skb)->ops->local_out(skb); + err = skb_dst(skb)->ops->local_out(skb->sk, skb); if (unlikely(err != 1)) goto out; -- cgit v1.1 From b92dacd45698e120104ff81066ceb534916090d9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:37 -0500 Subject: ipv4: Merge __ip_local_out and __ip_local_out_sk Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/ipv4/ip_output.c | 9 ++------- net/ipv4/route.c | 2 +- net/ipv4/xfrm4_policy.c | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c38dfd7..66c627b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -96,7 +96,7 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); struct iphdr *iph = ip_hdr(skb); @@ -108,17 +108,12 @@ int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) dst_output); } -int __ip_local_out(struct sk_buff *skb) -{ - return __ip_local_out_sk(skb->sk, skb); -} - int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); int err; - err = __ip_local_out_sk(sk, skb); + err = __ip_local_out(sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 638b976..bf1486b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -165,7 +165,7 @@ static struct dst_ops ipv4_dst_ops = { .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, - .local_out = __ip_local_out_sk, + .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, }; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d46d99f..f2606b9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -243,7 +243,7 @@ static struct dst_ops xfrm4_dst_ops = { .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, - .local_out = __ip_local_out_sk, + .local_out = __ip_local_out, .gc_thresh = 32768, }; -- cgit v1.1 From e2cb77db089796f163092326ca25512845df7a3a Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Wed, 7 Oct 2015 16:48:38 -0500 Subject: ipv4: Merge ip_local_out and ip_local_out_sk It is confusing and silly hiding a parameter so modify all of the callers to pass in the appropriate socket or skb->sk if no socket is known. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 4 ++-- net/ipv4/ip_output.c | 10 +++++----- net/ipv4/ip_tunnel_core.c | 2 +- net/ipv4/netfilter/ipt_SYNPROXY.c | 2 +- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/netfilter/nf_reject_ipv4.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index de6d4c8..43375d9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(skb); + return ip_local_out(skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return ip_local_out(skb); + return ip_local_out(skb->sk, skb); } static void igmp_gq_timer_expire(unsigned long data) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 66c627b..10366ee 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -108,7 +108,7 @@ int __ip_local_out(struct sock *sk, struct sk_buff *skb) dst_output); } -int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); int err; @@ -119,7 +119,7 @@ int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) return err; } -EXPORT_SYMBOL_GPL(ip_local_out_sk); +EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -169,7 +169,7 @@ int 
ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. */ - return ip_local_out(skb); + return ip_local_out(skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); @@ -456,7 +456,7 @@ packet_routed: skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out_sk(sk, skb); + res = ip_local_out(sk, skb); rcu_read_unlock(); return res; @@ -1436,7 +1436,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb); + err = ip_local_out(skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 84dce6a..8d85ecd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -79,7 +79,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __ip_select_ident(dev_net(rt->dst.dev), iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out_sk(sk, skb); + err = ip_local_out(sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 6a6e762..473faf7 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -63,7 +63,7 @@ synproxy_send_tcp(const struct synproxy_net *snet, nf_conntrack_get(nfct); } - ip_local_out(nskb); + ip_local_out(nskb->sk, nskb); return; free_nskb: diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ce2a59e..0b9abfb 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -92,7 +92,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, if (nf_dup_ipv4_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip_local_out(skb); + ip_local_out(skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c 
b/net/ipv4/netfilter/nf_reject_ipv4.c index 2f5e925..dcc125c 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -157,7 +157,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip_local_out(nskb); + ip_local_out(nskb->sk, nskb); return; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 504d1fc..d77503e 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1049,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip_local_out(skb); + ip_local_out(skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); -- cgit v1.1 From 9f8955cc468ddb7d08a0e614a45f9a82c4019b00 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:39 -0500 Subject: ipv6: Merge __ip6_local_out and __ip6_local_out_sk Only __ip6_local_out_sk has callers so rename __ip6_local_out_sk __ip6_local_out and remove the previous __ip6_local_out. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/ipv6/output_core.c | 9 ++------- net/ipv6/route.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index e5affb5..f93ae15 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -138,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst) EXPORT_SYMBOL(ip6_dst_hoplimit); #endif -int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip6_local_out(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); int len; @@ -153,11 +153,6 @@ int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); } - -int __ip6_local_out(struct sk_buff *skb) -{ - return __ip6_local_out_sk(skb->sk, skb); -} EXPORT_SYMBOL_GPL(__ip6_local_out); int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) @@ -165,7 +160,7 @@ int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) struct net *net = dev_net(skb_dst(skb)->dev); int err; - err = __ip6_local_out_sk(sk, skb); + err = __ip6_local_out(sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b62a507..d3d9467 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -226,7 +226,7 @@ static struct dst_ops ip6_dst_ops_template = { .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, - .local_out = __ip6_local_out_sk, + .local_out = __ip6_local_out, .neigh_lookup = ip6_neigh_lookup, }; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index f787683..08c9c93 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -285,7 +285,7 @@ static struct dst_ops xfrm6_dst_ops = { .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, - .local_out = __ip6_local_out_sk, + .local_out = __ip6_local_out, .gc_thresh = 32768, }; -- cgit v1.1 From 
792883303cdb3a7edd16017d7aba53926189ef41 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:40 -0500 Subject: ipv6: Merge ip6_local_out and ip6_local_out_sk Stop hiding the sk parameter with an inline helper function and make all of the callers pass it, so that it is clear what the function is doing. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 2 +- net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/ipv6/output_core.c | 8 +------- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0171e76..31c686b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1692,7 +1692,7 @@ int ip6_send_skb(struct sk_buff *skb) struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); int err; - err = ip6_local_out(skb); + err = ip6_local_out(skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index c235660..c38c341 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -76,7 +76,7 @@ synproxy_send_tcp(const struct synproxy_net *snet, nf_conntrack_get(nfct); } - ip6_local_out(nskb); + ip6_local_out(nskb->sk, nskb); return; free_nskb: diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index ee0d9a5..64f3fe5 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -68,7 +68,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, } if (nf_dup_ipv6_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip6_local_out(skb); + ip6_local_out(skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c
b/net/ipv6/netfilter/nf_reject_ipv6.c index 94b4c6d..a4f73e2 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -206,7 +206,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(nskb); + ip6_local_out(nskb->sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index f93ae15..1285581 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -155,7 +155,7 @@ int __ip6_local_out(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(__ip6_local_out); -int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip6_local_out(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); int err; @@ -166,10 +166,4 @@ int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) return err; } -EXPORT_SYMBOL_GPL(ip6_local_out_sk); - -int ip6_local_out(struct sk_buff *skb) -{ - return ip6_local_out_sk(skb->sk, skb); -} EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index d77503e..2042b93 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1141,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(skb); + ip6_local_out(skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); -- cgit v1.1 From f859b0f662493e4f53d462f5759e3c4302933077 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:41 -0500 Subject: ipv4: Cache net in iptunnel_xmit Store net in a variable in iptunnel_xmit so it does not need to be recomputed when it is used again. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S.
Miller --- net/ipv4/ip_tunnel_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 8d85ecd..caef8e2 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __u8 tos, __u8 ttl, __be16 df, bool xnet) { int pkt_len = skb->len - skb_inner_network_offset(skb); + struct net *net = dev_net(rt->dst.dev); struct iphdr *iph; int err; @@ -76,8 +77,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(dev_net(rt->dst.dev), iph, - skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out(sk, skb); if (unlikely(net_xmit_eval(err))) -- cgit v1.1 From 77589ce0f84dd99cc946fd71fe6fb44dd8220d0a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:42 -0500 Subject: ipv4: Cache net in ip_build_and_send_pkt and ip_queue_xmit Compute net and store it in a variable in the functions ip_build_and_send_pkt and ip_queue_xmit so that it does not need to be recomputed next time it is needed. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 10366ee..a7012f2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -139,6 +139,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. 
*/ @@ -157,7 +158,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph->id = 0; } else { iph->frag_off = 0; - __ip_select_ident(sock_net(sk), iph, 1); + __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { @@ -382,6 +383,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; @@ -412,7 +414,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, + rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -449,7 +451,7 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(sock_net(sk), skb, sk, + ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ @@ -462,7 +464,7 @@ packet_routed: no_route: rcu_read_unlock(); - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } -- cgit v1.1 From cf91a99daa4651d0c1f52b8c3d813fd44b43cada Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:45 -0500 Subject: ipv4, ipv6: Pass net into __ip_local_out and __ip6_local_out Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/ipv4/ip_output.c | 5 ++--- net/ipv6/output_core.c | 5 ++--- net/xfrm/xfrm_output.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a7012f2..39d3fbe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -96,9 +96,8 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); @@ -113,7 +112,7 @@ int ip_local_out(struct sock *sk, struct sk_buff *skb) struct net *net = dev_net(skb_dst(skb)->dev); int err; - err = __ip_local_out(sk, skb); + err = __ip_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 1285581..7f64d67 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -138,9 +138,8 @@ int ip6_dst_hoplimit(struct dst_entry *dst) EXPORT_SYMBOL(ip6_dst_hoplimit); #endif -int __ip6_local_out(struct sock *sk, struct sk_buff *skb) +int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); int len; len = skb->len - sizeof(struct ipv6hdr); @@ -160,7 +159,7 @@ int ip6_local_out(struct sock *sk, struct sk_buff *skb) struct net *net = dev_net(skb_dst(skb)->dev); int err; - err = __ip6_local_out(sk, skb); + err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a7a254f..cc3676e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -136,7 +136,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset(skb); - err = skb_dst(skb)->ops->local_out(skb->sk, skb); + 
err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) goto out; -- cgit v1.1 From 33224b16ffccb49cf798317670389e0bfba0024c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:46 -0500 Subject: ipv4, ipv6: Pass net into ip_local_out and ip6_local_out Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 4 ++-- net/ipv4/ip_output.c | 9 ++++----- net/ipv4/ip_tunnel_core.c | 2 +- net/ipv4/netfilter/ipt_SYNPROXY.c | 2 +- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/netfilter/nf_reject_ipv4.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/ipv6/output_core.c | 3 +-- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- 12 files changed, 17 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 43375d9..64aaf35 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(skb->sk, skb); + return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return ip_local_out(skb->sk, skb); + return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(unsigned long data) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 39d3fbe..9fe100a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -107,9 +107,8 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) dst_output); } -int ip_local_out(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct net *net, struct sock *sk, struct sk_buff 
*skb) { - struct net *net = dev_net(skb_dst(skb)->dev); int err; err = __ip_local_out(net, sk, skb); @@ -169,7 +168,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. */ - return ip_local_out(skb->sk, skb); + return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); @@ -457,7 +456,7 @@ packet_routed: skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(sk, skb); + res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; @@ -1437,7 +1436,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb->sk, skb); + err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index caef8e2..6cb9009 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -79,7 +79,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->ttl = ttl; __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out(sk, skb); + err = ip_local_out(net, sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 473faf7..f1a8df8 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -63,7 +63,7 @@ synproxy_send_tcp(const struct synproxy_net *snet, nf_conntrack_get(nfct); } - ip_local_out(nskb->sk, nskb); + ip_local_out(net, nskb->sk, nskb); return; free_nskb: diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 0b9abfb..ceb1873 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -92,7 +92,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, if (nf_dup_ipv4_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip_local_out(skb->sk, 
skb); + ip_local_out(net, skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index dcc125c..c747b2d 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -157,7 +157,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip_local_out(nskb->sk, nskb); + ip_local_out(net, nskb->sk, nskb); return; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 31c686b..98510fa 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1692,7 +1692,7 @@ int ip6_send_skb(struct sk_buff *skb) struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); int err; - err = ip6_local_out(skb->sk, skb); + err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index c38c341..a10a2a9 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -76,7 +76,7 @@ synproxy_send_tcp(const struct synproxy_net *snet, nf_conntrack_get(nfct); } - ip6_local_out(nskb->sk, nskb); + ip6_local_out(net, nskb->sk, nskb); return; free_nskb: diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 64f3fe5..6989c70 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -68,7 +68,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, } if (nf_dup_ipv6_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip6_local_out(skb->sk, skb); + ip6_local_out(net, skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index a4f73e2..7309e47 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ 
b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -206,7 +206,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(nskb->sk, nskb); + ip6_local_out(net, nskb->sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 7f64d67..462f2a76b 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -154,9 +154,8 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(__ip6_local_out); -int ip6_local_out(struct sock *sk, struct sk_buff *skb) +int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); int err; err = __ip6_local_out(net, sk, skb); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 2042b93..3264cb49 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1049,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip_local_out(skb->sk, skb); + ip_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1141,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(skb->sk, skb); + ip6_local_out(cp->ipvs->net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); -- cgit v1.1 From ede2059dbaf9c6557a49d466c8c7778343b208ff Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Oct 2015 16:48:47 -0500 Subject: dst: Pass net into dst->output The network namespace is already passed into dst_output pass it into dst->output lwt->output and friends. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/core/dst.c | 14 +++++++------- net/core/lwtunnel.c | 4 ++-- net/decnet/dn_route.c | 6 +++--- net/ipv4/ip_output.c | 6 ++---- net/ipv4/route.c | 4 ++-- net/ipv4/xfrm4_output.c | 4 +--- net/ipv6/ila.c | 4 ++-- net/ipv6/ip6_output.c | 3 +-- net/ipv6/route.c | 14 +++++++------- net/ipv6/xfrm6_output.c | 4 +--- net/mpls/mpls_iptunnel.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 12 files changed, 30 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 0771c8c..2a18180 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -144,12 +144,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard_sk(struct sock *sk, struct sk_buff *skb) +int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard_sk); +EXPORT_SYMBOL(dst_discard_out); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -177,7 +177,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -224,7 +224,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -352,7 +352,7 @@ static struct dst_ops md_dst_ops = { .family = AF_UNSPEC, }; -static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb) +static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { WARN_ONCE(1, "Attempting to call output on metadata dst\n"); kfree_skb(skb); @@ -375,7 +375,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) DST_METADATA | DST_NOCACHE | DST_NOCOUNT); dst->input = dst_md_discard; - dst->output = 
dst_md_discard_sk; + dst->output = dst_md_discard_out; memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); } @@ -430,7 +430,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst_discard; - dst->output = dst_discard_sk; + dst->output = dst_discard_out; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index dfb1a9c..299cfc2 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -180,7 +180,7 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) } EXPORT_SYMBOL(lwtunnel_cmp_encap); -int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; @@ -199,7 +199,7 @@ int lwtunnel_output(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) - ret = ops->output(sk, skb); + ret = ops->output(net, sk, skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e930321..27fce28 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -744,7 +744,7 @@ out: return NET_RX_DROP; } -static int dn_output(struct sock *sk, struct sk_buff *skb) +static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -832,7 +832,7 @@ drop: * Used to catch bugs. This should never normally get * called. 
*/ -static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb) +static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1469,7 +1469,7 @@ make_route: rt->n = neigh; rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug_sk; + rt->dst.output = dn_rt_bug_out; switch (res.type) { case RTN_UNICAST: rt->dst.input = dn_forward; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9fe100a..67404e1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -284,11 +284,10 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } -int ip_mc_output(struct sock *sk, struct sk_buff *skb) +int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; - struct net *net = dev_net(dev); /* * If the indicated interface is up and running, send the packet. @@ -347,10 +346,9 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sock *sk, struct sk_buff *skb) +int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; - struct net *net = dev_net(dev); IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bf1486b..4be5ff08 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1152,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) +static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -2303,7 +2303,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + 
new->output = dst_discard_out; new->dev = ort->dst.dev; if (new->dev) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 17db61f..9f298d0 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -94,10 +94,8 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm4_output(struct sock *sk, struct sk_buff *skb) +int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, __xfrm4_output, diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index 678d2df..1a6852e 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -91,7 +91,7 @@ static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) *(__be64 *)&ip6h->daddr = p->locator; } -static int ila_output(struct sock *sk, struct sk_buff *skb) +static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -100,7 +100,7 @@ static int ila_output(struct sock *sk, struct sk_buff *skb) update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - return dst->lwtstate->orig_output(sk, skb); + return dst->lwtstate->orig_output(net, sk, skb); drop: kfree_skb(skb); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 98510fa..32583b5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -130,11 +130,10 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } -int ip6_output(struct sock *sk, struct sk_buff *skb) +int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - struct net *net = dev_net(dev); if (unlikely(idev->cnf.disable_ipv6)) { IP6_INC_STATS(net, idev, 
IPSTATS_MIB_OUTDISCARDS); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d3d9467..4320ddc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -86,9 +86,9 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); @@ -308,7 +308,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard_sk, + .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -1195,7 +1195,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; if (dst_metrics_read_only(&ort->dst)) new->_metrics = ort->dst._metrics; @@ -1853,7 +1853,7 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard_sk; + rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: @@ -2446,7 +2446,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) +static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { 
skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2457,7 +2457,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index c9a5bd5..9db067a 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -173,10 +173,8 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm6_output(struct sock *sk, struct sk_buff *skb) +int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb_dst(skb)->dev); - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, __xfrm6_output, diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 21e70bc..67591ae 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) return en->labels * sizeof(struct mpls_shim_hdr); } -int mpls_output(struct sock *sk, struct sk_buff *skb) +int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f4f2d98..09bfcba 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1957,7 +1957,7 @@ purge_queue: xfrm_pol_put(pol); } -static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) +static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct 
dst_entry *dst = skb_dst(skb); -- cgit v1.1 From cfc81b50387086c3a1ca6d2be3c46561edfbc3b5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 7 Oct 2015 10:16:09 +0200 Subject: bpf, skb_do_redirect: clear sender_cpu before xmit Similar to commit c29390c6dfee ("xps: must clear sender_cpu before forwarding"), we also need to clear the skb->sender_cpu when moving from RX to TX via skb_do_redirect() due to the shared location of napi_id (used on RX) and sender_cpu (used on TX). Fixes: 27b29f63058d ("bpf: add bpf_redirect() helper") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index da3e535..8f4603c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1462,6 +1462,7 @@ int skb_do_redirect(struct sk_buff *skb) return dev_forward_skb(dev, skb); skb->dev = dev; + skb_sender_cpu_clear(skb); return dev_queue_xmit(skb); } -- cgit v1.1 From 28335a7445202a3d118145a07d9138e9881ebe18 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 7 Oct 2015 08:40:13 -0700 Subject: net: Do not drop to make_route if oif is l3mdev Commit deaa0a6a930 ("net: Lookup actual route when oif is VRF device") exposed a bug in __ip_route_output_key_hash for VRF devices: on FIB lookup failure if the oif is specified the current logic drops to make_route on the assumption that the route tables are wrong. For VRF/L3 master devices this leads to wrong dst entries and route lookups. 
For example: $ ip route ls table vrf-red unreachable default broadcast 10.2.1.0 dev eth1 proto kernel scope link src 10.2.1.2 10.2.1.0/24 dev eth1 proto kernel scope link src 10.2.1.2 local 10.2.1.2 dev eth1 proto kernel scope host src 10.2.1.2 broadcast 10.2.1.255 dev eth1 proto kernel scope link src 10.2.1.2 $ ip route get oif vrf-red 1.1.1.1 1.1.1.1 dev vrf-red src 10.0.0.2 cache With this patch: $ ip route get oif vrf-red 1.1.1.1 RTNETLINK answers: No route to host which is the correct response based on the default route Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4be5ff08..85f184e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2196,7 +2196,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif) { + if (fl4->flowi4_oif && + !netif_index_is_l3_master(net, fl4->flowi4_oif)) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. -- cgit v1.1 From 4d6a6aed22f91b35c14a6717d42953f260090175 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 2 Oct 2015 20:28:04 +0200 Subject: 6lowpan: move shared settings to lowpan_netdev_setup This patch moves values for all lowpan interfaces to the shared implementation of 6lowpan. This patch also quietly fixes the forgotten IFF_NO_QUEUE flag for the bluetooth 6LoWPAN interface. An identical commit is 4afbc0d ("net: 6lowpan: convert to using IFF_NO_QUEUE") which wasn't changed for bluetooth 6lowpan. All 6lowpan interfaces should be virtual with IFF_NO_QUEUE, using EUI64 address length, the mtu size is 1280 (IPV6_MIN_MTU) and the netdev type is ARPHRD_6LOWPAN. 
Signed-off-by: Alexander Aring Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/6lowpan/core.c | 5 +++++ net/bluetooth/6lowpan.c | 6 ------ net/ieee802154/6lowpan/core.c | 4 ---- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index ae1896f..83b19e0 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -17,6 +17,11 @@ void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) { + dev->addr_len = EUI64_ADDR_LEN; + dev->type = ARPHRD_6LOWPAN; + dev->mtu = IPV6_MIN_MTU; + dev->priv_flags |= IFF_NO_QUEUE; + lowpan_priv(dev)->lltype = lltype; } EXPORT_SYMBOL(lowpan_netdev_setup); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 9363f05..db73b8a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -35,7 +35,6 @@ static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" -#define EUI64_ADDR_LEN 8 struct skb_cb { struct in6_addr addr; @@ -674,13 +673,8 @@ static struct header_ops header_ops = { static void netdev_setup(struct net_device *dev) { - dev->addr_len = EUI64_ADDR_LEN; - dev->type = ARPHRD_6LOWPAN; - dev->hard_header_len = 0; dev->needed_tailroom = 0; - dev->mtu = IPV6_MIN_MTU; - dev->tx_queue_len = 0; dev->flags = IFF_RUNNING | IFF_POINTOPOINT | IFF_MULTICAST; dev->watchdog_timeo = 0; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 44420ed..20c49c7 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -101,13 +101,9 @@ static const struct net_device_ops lowpan_netdev_ops = { static void lowpan_setup(struct net_device *ldev) { - ldev->addr_len = IEEE802154_ADDR_LEN; memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); - ldev->type = ARPHRD_6LOWPAN; /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); - ldev->mtu = IPV6_MIN_MTU; - 
ldev->priv_flags |= IFF_NO_QUEUE; ldev->flags = IFF_BROADCAST | IFF_MULTICAST; ldev->netdev_ops = &lowpan_netdev_ops; -- cgit v1.1 From 46234253b9363894a254844a6550b4cc5f3edfe8 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 8 Oct 2015 01:20:35 +0200 Subject: net: move net_get_random_once to lib There's no good reason why users outside of networking should not be using this facility, f.e. for initializing their seeds. Therefore, make it accessible from there as get_random_once(). Signed-off-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/utils.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 3dffce9..3d17ca8 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -348,52 +348,3 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, } } EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); - -struct __net_random_once_work { - struct work_struct work; - struct static_key *key; -}; - -static void __net_random_once_deferred(struct work_struct *w) -{ - struct __net_random_once_work *work = - container_of(w, struct __net_random_once_work, work); - BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); - kfree(work); -} - -static void __net_random_once_disable_jump(struct static_key *key) -{ - struct __net_random_once_work *w; - - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return; - - INIT_WORK(&w->work, __net_random_once_deferred); - w->key = key; - schedule_work(&w->work); -} - -bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *once_key) -{ - static DEFINE_SPINLOCK(lock); - unsigned long flags; - - spin_lock_irqsave(&lock, flags); - if (*done) { - spin_unlock_irqrestore(&lock, flags); - return false; - } - - get_random_bytes(buf, nbytes); - *done = true; - spin_unlock_irqrestore(&lock, flags); 
- - __net_random_once_disable_jump(once_key); - - return true; -} -EXPORT_SYMBOL(__net_get_random_once); -- cgit v1.1 From 3ad0040573b0c00f88488bc31958acd07a55ee2e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Oct 2015 01:20:39 +0200 Subject: bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs While recently arguing on a seccomp discussion that raw prandom_u32() access shouldn't be exposed to unprivileged user space, I forgot the fact that SKF_AD_RANDOM extension actually already does it for some time in cBPF via commit 4cd3675ebf74 ("filter: added BPF random opcode"). Since prandom_u32() is being used in a lot of critical networking code, let's be more conservative and split their states. Furthermore, consolidate eBPF and cBPF prandom handlers to use the new internal PRNG. For eBPF, bpf_get_prandom_u32() was only accessible for privileged users, but should that change one day, we also don't want to leak raw sequences through things like eBPF maps. One thought was also to have own per bpf_prog states, but due to ABI reasons this is not easily possible, i.e. the program code currently cannot access bpf_prog itself, and copying the rnd_state to/from the stack scratch space whenever a program uses the prng seems not really worth the trouble and seems too hacky. If needed, taus113 could in such cases be implemented within eBPF using a map entry to keep the state space, or get_random_bytes() could become a second helper in cases where performance would not be critical. Both sides can trigger a one-time late init via prandom_init_once() on the shared state. Performance-wise, there should even be a tiny gain as bpf_user_rnd_u32() saves one function call. The PRNG needs to live inside the BPF core since kernels could have a NET-less config as well. Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Cc: Chema Gonzalez Signed-off-by: David S. 
Miller --- net/core/filter.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 8f4603c..342e6c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -149,12 +149,6 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } -/* note that this only generates 32-bit random numbers */ -static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) -{ - return prandom_u32(); -} - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -313,7 +307,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, *insn = BPF_EMIT_CALL(__get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: - *insn = BPF_EMIT_CALL(__get_random_u32); + *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); + bpf_user_rnd_init_once(); break; } break; -- cgit v1.1 From f640ee98bbeaa169684a571e0b96bea563bb6015 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 8 Oct 2015 12:35:42 +0200 Subject: Bluetooth: Fix basic debugfs entries for unconfigured controllers When the controller is unconfigured (for example it does not have a valid Bluetooth address), then the basic debugfs entries for dut_mode and vendor_diag are not created. Ensure they are created in __hci_init and also __hci_unconf_init functions. One of them is called during setup stage of a new controller. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b2095ca..d2b3dd3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -184,6 +184,16 @@ static const struct file_operations vendor_diag_fops = { .llseek = default_llseek, }; +static void hci_debugfs_create_basic(struct hci_dev *hdev) +{ + debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, + &dut_mode_fops); + + if (hdev->set_diag) + debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, + &vendor_diag_fops); +} + /* ---- HCI requests ---- */ static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, @@ -900,20 +910,8 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - if (hci_dev_test_flag(hdev, HCI_SETUP)) { - /* The Device Under Test (DUT) mode is special and available - * for all controller types. So just create it early on. - */ - debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, - &dut_mode_fops); - - /* When the driver supports the set_diag callback, then - * expose an entry to modify the vendor diagnostic setting. 
- */ - if (hdev->set_diag) - debugfs_create_file("vendor_diag", 0644, hdev->debugfs, - hdev, &vendor_diag_fops); - } + if (hci_dev_test_flag(hdev, HCI_SETUP)) + hci_debugfs_create_basic(hdev); err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); if (err < 0) @@ -990,6 +988,9 @@ static int __hci_unconf_init(struct hci_dev *hdev) if (err < 0) return err; + if (hci_dev_test_flag(hdev, HCI_SETUP)) + hci_debugfs_create_basic(hdev); + return 0; } -- cgit v1.1 From 61d03535e4be3a46c1e171a25458237e343195e3 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 8 Oct 2015 21:28:54 +0800 Subject: net/netlink: lockdep_genl_is_held can be boolean This patch makes lockdep_genl_is_held return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 75724a9..bc0e504 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -39,7 +39,7 @@ void genl_unlock(void) EXPORT_SYMBOL(genl_unlock); #ifdef CONFIG_LOCKDEP -int lockdep_genl_is_held(void) +bool lockdep_genl_is_held(void) { return lockdep_is_held(&genl_mutex); } -- cgit v1.1 From 875e08294911b3cb8c60416d64d990809421de29 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 8 Oct 2015 21:28:56 +0800 Subject: net/nfnetlink: lockdep_nfnl_is_held can be boolean This patch makes lockdep_nfnl_is_held return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: David S. 
Miller --- net/netfilter/nfnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 70277b1..f1d9e88 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -64,7 +64,7 @@ void nfnl_unlock(__u8 subsys_id) EXPORT_SYMBOL_GPL(nfnl_unlock); #ifdef CONFIG_PROVE_LOCKING -int lockdep_nfnl_is_held(u8 subsys_id) +bool lockdep_nfnl_is_held(u8 subsys_id) { return lockdep_is_held(&table[subsys_id].mutex); } -- cgit v1.1 From 45ae74f56162e7a017c3a4e130cf1bcd8d2d17cc Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 8 Oct 2015 21:28:59 +0800 Subject: net/dccp: dccp_bad_service_code can be boolean This patch makes dccp_bad_service_code return bool due to these particular functions only using either one or zero as their return value. dccp_list_has_service has also been made to return bool in this patchset. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: David S. Miller --- net/dccp/dccp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e1f8234..923f5a1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -325,13 +325,13 @@ void dccp_send_close(struct sock *sk, const int active); int dccp_invalid_packet(struct sk_buff *skb); u32 dccp_sample_rtt(struct sock *sk, long delta); -static inline int dccp_bad_service_code(const struct sock *sk, +static inline bool dccp_bad_service_code(const struct sock *sk, const __be32 service) { const struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_service == service) - return 0; + return false; return !dccp_list_has_service(dp->dccps_service_list, service); } -- cgit v1.1 From 0cbf334376d5e82d7a2f5cd234ca4f5d0843f3ea Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 8 Oct 2015 21:29:02 +0800 Subject: net/core: lockdep_rtnl_is_held can be boolean This patch makes lockdep_rtnl_is_held return bool due to this particular function only using
either one or zero as its return value. In another patch lockdep_is_held is also made return bool. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b2258a3..2477595 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -96,7 +96,7 @@ int rtnl_is_locked(void) EXPORT_SYMBOL(rtnl_is_locked); #ifdef CONFIG_PROVE_LOCKING -int lockdep_rtnl_is_held(void) +bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } -- cgit v1.1 From b6191aeeec1045decb5964e6b5e8c314f5982c85 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 7 Oct 2015 17:27:43 -0400 Subject: net/core: make sock_diag.c explicitly non-modular The Makefile currently controlling compilation of this code lists it under "obj-y" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We can change to one of the other priority initcalls (subsys?) at any later date, if desired. We can't remove module.h since the file uses other module related stuff even though it is not modular itself. We move the information from the MODULE_LICENSE tag to the top of the file, since that information is not captured anywhere else. The MODULE_ALIAS_NET_PF_PROTO becomes a no-op in the non modular case, so it is removed. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Nicolas Dichtel Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Craig Gallek Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: David S. 
Miller --- net/core/sock_diag.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 817622f..0c1d58d 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -1,3 +1,5 @@ +/* License: GPL */ + #include #include #include @@ -323,14 +325,4 @@ static int __init sock_diag_init(void) BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } - -static void __exit sock_diag_exit(void) -{ - unregister_pernet_subsys(&diag_net_ops); - destroy_workqueue(broadcast_wq); -} - -module_init(sock_diag_init); -module_exit(sock_diag_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); +device_initcall(sock_diag_init); -- cgit v1.1 From 36b9ad8084bd7ecf6d2241beca23e71f5f4b0cf1 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 7 Oct 2015 17:27:44 -0400 Subject: net/dcb: make dcbnl.c explicitly non-modular The Kconfig currently controlling compilation of this code is: net/dcb/Kconfig:config DCB net/dcb/Kconfig: bool "Data Center Bridging support" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We can change to one of the other priority initcalls (subsys?) at any later date, if desired. We also delete the MODULE_LICENSE tag etc. since all that information is (or is now) already contained at the top of the file in the comments. Cc: "David S. Miller" Cc: Or Gerlitz Cc: Anish Bhatt Cc: John Fastabend Cc: Shani Michaeli Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: David S. 
Miller --- net/dcb/dcbnl.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 5b21f6f..4f6c186 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -13,6 +13,7 @@ * You should have received a copy of the GNU General Public License along with * this program; if not, see . * + * Description: Data Center Bridging netlink interface * Author: Lucy Liu */ @@ -24,7 +25,7 @@ #include #include #include -#include +#include #include /* Data Center Bridging (DCB) is a collection of Ethernet enhancements @@ -48,10 +49,6 @@ * features for capable devices. */ -MODULE_AUTHOR("Lucy Liu, "); -MODULE_DESCRIPTION("Data Center Bridging netlink interface"); -MODULE_LICENSE("GPL"); - /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ @@ -1935,19 +1932,6 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); -static void dcb_flushapp(void) -{ - struct dcb_app_type *app; - struct dcb_app_type *tmp; - - spin_lock_bh(&dcb_lock); - list_for_each_entry_safe(app, tmp, &dcb_app_list, list) { - list_del(&app->list); - kfree(app); - } - spin_unlock_bh(&dcb_lock); -} - static int __init dcbnl_init(void) { INIT_LIST_HEAD(&dcb_app_list); @@ -1957,12 +1941,4 @@ static int __init dcbnl_init(void) return 0; } -module_init(dcbnl_init); - -static void __exit dcbnl_exit(void) -{ - rtnl_unregister(PF_UNSPEC, RTM_GETDCB); - rtnl_unregister(PF_UNSPEC, RTM_SETDCB); - dcb_flushapp(); -} -module_exit(dcbnl_exit); +device_initcall(dcbnl_init); -- cgit v1.1 From 075640e364f3b46311766f0eff28bd3695637e16 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 7 Oct 2015 17:27:45 -0400 Subject: net/sched: make sch_blackhole.c explicitly non-modular The Kconfig currently controlling compilation of this code is: net/sched/Kconfig:menuconfig NET_SCHED net/sched/Kconfig: bool "QoS and/or fair queueing" 
...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We can change to one of the other priority initcalls (subsys?) at any later date, if desired. We also delete the MODULE_LICENSE tag since all that information is already contained at the top of the file in the comments. Cc: Jamal Hadi Salim Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/sched/sch_blackhole.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 094a874..3fee70d 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -11,7 +11,7 @@ * Note: Quantum tunneling is not supported. */ -#include +#include #include #include #include @@ -37,17 +37,8 @@ static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; -static int __init blackhole_module_init(void) +static int __init blackhole_init(void) { return register_qdisc(&blackhole_qdisc_ops); } - -static void __exit blackhole_module_exit(void) -{ - unregister_qdisc(&blackhole_qdisc_ops); -} - -module_init(blackhole_module_init) -module_exit(blackhole_module_exit) - -MODULE_LICENSE("GPL"); +device_initcall(blackhole_init) -- cgit v1.1 From ff936a04e5f28b7e0455be0e7fa91334f89e4b44 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Oct 2015 10:55:41 -0700 Subject: bpf: fix cb access in socket filter programs eBPF socket filter programs may see junk in 'u32 cb[5]' area, since it could have been used by protocol layers earlier. For socket filter programs used in af_packet we need to clean 20 bytes of skb->cb area if it could be used by the program. 
For programs attached to TCP/UDP sockets we need to save/restore these 20 bytes, since it's used by protocol layers. Remove SK_RUN_FILTER macro, since it's no longer used. Long term we may move this bpf cb area to per-cpu scratch, but that requires addition of new 'per-cpu load/store' instructions, so not suitable as a short term fix. Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") Reported-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 12 +++++++----- net/packet/af_packet.c | 10 +++++----- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 342e6c8..5f4cf1c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -56,10 +56,10 @@ * @sk: sock associated with &sk_buff * @skb: buffer to filter * - * Run the filter code and then cut skb->data to correct size returned by - * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller + * Run the eBPF program and then cut skb->data to correct size returned by + * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to SK_RUN_FILTER. It returns 0 if the packet should + * wrapper to BPF_PROG_RUN. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -83,7 +83,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); err = pkt_len ? 
pskb_trim(skb, pkt_len) : -EPERM; } @@ -1736,7 +1736,8 @@ static bool tc_cls_act_is_valid_access(int off, int size, static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, - struct bpf_insn *insn_buf) + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1827,6 +1828,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + prog->cb_access = 1; ctx_off -= offsetof(struct __sk_buff, cb[0]); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct qdisc_skb_cb, data); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 81c900f..104910f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1423,7 +1423,7 @@ static unsigned int fanout_demux_bpf(struct packet_fanout *f, rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) - ret = BPF_PROG_RUN(prog, skb) % num; + ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; @@ -1939,16 +1939,16 @@ out_free: return err; } -static unsigned int run_filter(const struct sk_buff *skb, - const struct sock *sk, - unsigned int res) +static unsigned int run_filter(struct sk_buff *skb, + const struct sock *sk, + unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) - res = SK_RUN_FILTER(filter, skb); + res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; -- cgit v1.1 From e446f9dfe17bbaa76a1fe22912636f38be1e1af8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2015 05:01:55 -0700 Subject: net: synack packets can be attached to request sockets selinux needs few changes to accommodate fact that SYNACK messages can be attached to a request socket, lacking sk_security pointer (Only syncookies are still attached to a TCP_LISTEN socket) Adds a new 
sk_listener() helper, and use it in selinux and sch_fq Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Reported by: kernel test robot Cc: Paul Moore Cc: Stephen Smalley Cc: Eric Paris Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 3386cce..109b232 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -225,6 +225,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) return &q->internal; /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * or a listener (SYNCOOKIE mode) * 1) request sockets are not full blown, * they do not contain sk_pacing_rate * 2) They are not part of a 'flow' yet @@ -232,7 +233,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) * especially if the listener set SO_MAX_PACING_RATE * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) { + if (!sk || sk_listener(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not -- cgit v1.1 From 146a32067b3fde1424d737d7fb333eb0951e6419 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 8 Oct 2015 11:35:12 -0400 Subject: net: dsa: add port_fdb_prepare Push the prepare phase for FDB operations down to the DSA drivers, with a new port_fdb_prepare function. Currently only mv88e6xxx is affected. Signed-off-by: Vivien Didelot Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4f607bc..48e8c15 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -346,10 +346,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; + int ret; + + if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) + return -EOPNOTSUPP; if (switchdev_trans_ph_prepare(trans)) - ret = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP; + ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); else ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid); -- cgit v1.1 From 1f36faf26943f5f5fc1d1a7be6ce252d2ff25e1a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 8 Oct 2015 11:35:13 -0400 Subject: net: dsa: push prepare phase in port_fdb_add Now that the prepare phase is pushed down to the DSA drivers, propagate it to the port_fdb_add function. Signed-off-by: Vivien Didelot Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 48e8c15..6f7f27e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -354,7 +354,7 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, if (switchdev_trans_ph_prepare(trans)) ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); else - ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid); + ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans); return ret; } -- cgit v1.1 From 8057b3e7a1cfb4da61717ba609e1ea642bb82f9b Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 8 Oct 2015 11:35:14 -0400 Subject: net: dsa: use switchdev obj in port_fdb_del For consistency with the FDB add operation, propagate the switchdev_obj_port_fdb structure in the DSA drivers. 
Signed-off-by: Vivien Didelot Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6f7f27e..bb2bd3b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -367,7 +367,7 @@ static int dsa_slave_port_fdb_del(struct net_device *dev, int ret = -EOPNOTSUPP; if (ds->drv->port_fdb_del) - ret = ds->drv->port_fdb_del(ds, p->port, fdb->addr, fdb->vid); + ret = ds->drv->port_fdb_del(ds, p->port, fdb); return ret; } -- cgit v1.1 From 6bcfd7f8c28887a4298bc4386b02cb90c9fa0c13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2015 11:16:48 -0700 Subject: tcp: fix RFS vs lockless listeners Before recent TCP listener patches, we were updating listener sk->sk_rxhash before the cloning of master socket. Children's sk_rxhash was therefore correct after the normal 3WHS. But with lockless listener, we no longer dirty/change listener sk_rxhash as it would be racy. We need to correctly update the child sk_rxhash, otherwise the first data packet won't hit the correct CPU if RFS is used. Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Reported-by: Willem de Bruijn Cc: Tom Herbert Acked-by: Tom Herbert Signed-off-by: David S.
Miller --- net/ipv4/syncookies.c | 1 + net/ipv4/tcp_minisocks.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 8113c30..2dbb113 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -225,6 +225,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) { atomic_set(&req->rsk_refcnt, 1); + sock_rps_save_rxhash(child, skb); inet_csk_reqsk_queue_add(sk, req, child); } else { reqsk_free(req); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9adf1e2..1079e6a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -768,6 +768,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); -- cgit v1.1 From 3741873b4f73b572b8f8835e6bd114e08316a160 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 8 Oct 2015 10:38:52 -0700 Subject: bridge: allow adding of fdb entries pointing to the bridge device This patch enables adding of fdb entries pointing to the bridge device. This can be used to propagate mac address of vlan interfaces configured on top of the vlan filtering bridge. Before: $bridge fdb add 44:38:39:00:27:9f dev bridge RTNETLINK answers: Invalid argument After: $bridge fdb add 44:38:39:00:27:9f dev bridge Signed-off-by: Roopa Prabhu Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_fdb.c | 122 ++++++++++++++++++++++++++++++++++++++------------- net/bridge/br_vlan.c | 1 + 2 files changed, 93 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 7f7d551..f43ce05 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -608,13 +608,14 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } -static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb) +static int fdb_to_nud(const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb) { if (fdb->is_local) return NUD_PERMANENT; else if (fdb->is_static) return NUD_NOARP; - else if (has_expired(fdb->dst->br, fdb)) + else if (has_expired(br, fdb)) return NUD_STALE; else return NUD_REACHABLE; @@ -640,7 +641,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0; ndm->ndm_type = 0; ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; - ndm->ndm_state = fdb_to_nud(fdb); + ndm->ndm_state = fdb_to_nud(br, fdb); if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr)) goto nla_put_failure; @@ -785,7 +786,7 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, } } - if (fdb_to_nud(fdb) != state) { + if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { fdb->is_local = 1; if (!fdb->is_static) { @@ -846,8 +847,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], const unsigned char *addr, u16 vid, u16 nlh_flags) { struct net_bridge_vlan_group *vg; - struct net_bridge_port *p; + struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; + struct net_bridge *br = NULL; int err = 0; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { @@ -860,26 +862,36 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - p = br_port_get_rtnl(dev); - if (p == NULL) { - pr_info("bridge: RTM_NEWNEIGH %s not a bridge 
port\n", - dev->name); - return -EINVAL; + if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + vg = nbp_vlan_group(p); } - vg = nbp_vlan_group(p); if (vid) { v = br_vlan_find(vg, vid); - if (!v) { - pr_info("bridge: RTM_NEWNEIGH with unconfigured " - "vlan %d on port %s\n", vid, dev->name); + if (!v || !br_vlan_should_use(v)) { + pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, vid); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, vid); } else { - err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, 0); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, 0); if (err || !vg || !vg->num_vlans) goto out; @@ -888,7 +900,13 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], * vlan on this port. 
*/ list_for_each_entry(v, &vg->vlan_list, vlist) { - err = __br_fdb_add(ndm, p, addr, nlh_flags, v->vid); + if (!br_vlan_should_use(v)) + continue; + if (dev->priv_flags & IFF_EBRIDGE) + err = br_fdb_insert(br, NULL, addr, v->vid); + else + err = __br_fdb_add(ndm, p, addr, nlh_flags, + v->vid); if (err) goto out; } @@ -898,6 +916,32 @@ out: return err; } +static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, + u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr, vid); + if (!fdb) + return -ENOENT; + + fdb_delete(br, fdb); + return 0; +} + +static int __br_fdb_delete_by_addr(struct net_bridge *br, + const unsigned char *addr, u16 vid) +{ + int err; + + spin_lock_bh(&br->hash_lock); + err = fdb_delete_by_addr(br, addr, vid); + spin_unlock_bh(&br->hash_lock); + + return err; +} + static int fdb_delete_by_addr_and_port(struct net_bridge_port *p, const u8 *addr, u16 vlan) { @@ -931,35 +975,53 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], const unsigned char *addr, u16 vid) { struct net_bridge_vlan_group *vg; - struct net_bridge_port *p; + struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; + struct net_bridge *br = NULL; int err; - p = br_port_get_rtnl(dev); - if (p == NULL) { - pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", - dev->name); - return -EINVAL; + if (dev->priv_flags & IFF_EBRIDGE) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + vg = nbp_vlan_group(p); } - vg = nbp_vlan_group(p); if (vid) { v = br_vlan_find(vg, vid); if (!v) { - pr_info("bridge: RTM_DELNEIGH with unconfigured " - "vlan %d on port %s\n", vid, dev->name); + pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } - err = __br_fdb_delete(p, addr, vid); + if 
(dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, vid); + else + err = __br_fdb_delete(p, addr, vid); } else { err = -ENOENT; - err &= __br_fdb_delete(p, addr, 0); + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, 0); + else + err &= __br_fdb_delete(p, addr, 0); + if (!vg || !vg->num_vlans) goto out; - list_for_each_entry(v, &vg->vlan_list, vlist) - err &= __br_fdb_delete(p, addr, v->vid); + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (dev->priv_flags & IFF_EBRIDGE) + err = __br_fdb_delete_by_addr(br, addr, v->vid); + else + err &= __br_fdb_delete(p, addr, v->vid); + } } out: return err; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index eae07ee..7a95e31 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -564,6 +564,7 @@ int br_vlan_delete(struct net_bridge *br, u16 vid) return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); + br_fdb_delete_by_port(br, NULL, vid, 0); return __vlan_del(v); } -- cgit v1.1 From 7533ce3055bbe9577276a847125b156c44a5bbce Mon Sep 17 00:00:00 2001 From: Richard Sailer Date: Fri, 9 Oct 2015 02:41:37 +0200 Subject: tcp: change type of alive from int to bool The alive parameter of tcp_orphan_retries indicates whether the connection is assumed alive or not. In the function and in all places calling it, it is used as a boolean value. Therefore this changes the type of alive to bool in the function definition and all calling locations. Since tcp_orphan_retries is a tcp_timer.c local function, no change in any other file or header is necessary. Signed-off-by: Richard Sailer Signed-off-by: David S.
Miller --- net/ipv4/tcp_timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7149ebc..c9c716a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) } /* Calculate maximal number or retries on an orphaned socket. */ -static int tcp_orphan_retries(struct sock *sk, int alive) +static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sysctl_tcp_orphan_retries; /* May be zero. */ @@ -184,7 +184,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -298,7 +298,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) -- cgit v1.1 From 464314ea6c119ebc22ee78453e63814453c31611 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 8 Oct 2015 19:23:18 -0700 Subject: switchdev: skip over ports returning -EOPNOTSUPP when recursing ports This allows us to recurse over all the ports, skipping over unsupporting ports. Without the change, the recursion would stop at first unsupported port. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/switchdev/switchdev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 6e4a4f9..7a9ab90 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -147,7 +147,7 @@ static int __switchdev_port_attr_set(struct net_device *dev, return ops->switchdev_port_attr_set(dev, attr, trans); if (attr->flags & SWITCHDEV_F_NO_RECURSE) - return err; + goto done; /* Switch device port(s) may be stacked under * bond/team/vlan dev, so recurse down to set attr on @@ -156,10 +156,17 @@ static int __switchdev_port_attr_set(struct net_device *dev, netdev_for_each_lower_dev(dev, lower_dev, iter) { err = __switchdev_port_attr_set(lower_dev, attr, trans); + if (err == -EOPNOTSUPP && + attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP) + continue; if (err) break; } +done: + if (err == -EOPNOTSUPP && attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP) + err = 0; + return err; } -- cgit v1.1 From c62987bbd8a1a1664f99e89e3959339350a6131e Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 8 Oct 2015 19:23:19 -0700 Subject: bridge: push bridge setting ageing_time down to switchdev Use SWITCHDEV_F_SKIP_EOPNOTSUPP to skip over ports in bridge that don't support setting ageing_time (or setting bridge attrs in general). If push fails, don't update ageing_time in bridge and return err to user. If push succeeds, update ageing_time in bridge and run gc_timer now to recalibrate when to run gc_timer next, based on new ageing_time. Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- net/bridge/br_ioctl.c | 3 +-- net/bridge/br_netlink.c | 6 +++--- net/bridge/br_private.h | 1 + net/bridge/br_stp.c | 23 +++++++++++++++++++++++ net/bridge/br_sysfs_br.c | 3 +-- 5 files changed, 29 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 8d423bc..263b4de 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -200,8 +200,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br->ageing_time = clock_t_to_jiffies(args[1]); - return 0; + return br_set_ageing_time(br, args[1]); case BRCTL_GET_PORT_INFO: { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d78b442..544ab96 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -870,9 +870,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } if (data[IFLA_BR_AGEING_TIME]) { - u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); - - br->ageing_time = clock_t_to_jiffies(ageing_time); + err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME])); + if (err) + return err; } if (data[IFLA_BR_STP_STATE]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 09d3ecb..ba0c67b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -882,6 +882,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned long x); int br_set_max_age(struct net_bridge *br, unsigned long x); +int br_set_ageing_time(struct net_bridge *br, u32 ageing_time); /* br_stp_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 3a982c0..db6d243de 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -566,6 +566,29 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) } +int br_set_ageing_time(struct 
net_bridge *br, u32 ageing_time) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.ageing_time = ageing_time, + }; + unsigned long t = clock_t_to_jiffies(ageing_time); + int err; + + if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME) + return -ERANGE; + + err = switchdev_port_attr_set(br->dev, &attr); + if (err) + return err; + + br->ageing_time = t; + mod_timer(&br->gc_timer, jiffies); + + return 0; +} + void __br_set_forward_delay(struct net_bridge *br, unsigned long t) { br->bridge_forward_delay = t; diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 4c97fc5..04ef192 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -102,8 +102,7 @@ static ssize_t ageing_time_show(struct device *d, static int set_ageing_time(struct net_bridge *br, unsigned long val) { - br->ageing_time = clock_t_to_jiffies(val); - return 0; + return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, -- cgit v1.1 From 1be7f75d1668d6296b80bf35dcf6762393530afc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Oct 2015 22:23:21 -0700 Subject: bpf: enable non-root eBPF programs In order to let unprivileged users load and execute eBPF programs teach verifier to prevent pointer leaks. Verifier will prevent - any arithmetic on pointers (except R10+Imm which is used to compute stack addresses) - comparison of pointers (except if (map_value_ptr == 0) ... ) - passing pointers to helper functions - indirectly passing pointers in stack to helper functions - returning pointer from bpf program - storing pointers into ctx or maps Spill/fill of pointers into stack is allowed, but mangling of pointers stored in the stack or reading them byte by byte is not. 
Within bpf programs the pointers do exist, since programs need to be able to access maps, pass skb pointer to LD_ABS insns, etc but programs cannot pass such pointer values to the outside or obfuscate them. Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs, so that socket filters (tcpdump), af_packet (quic acceleration) and future kcm can use it. tracing and tc cls/act program types still require root permissions, since tracing actually needs to be able to see all kernel pointers and tc is for root only. For example, the following unprivileged socket filter program is allowed: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += skb->len; return 0; } but the following program is not: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += (u64) skb; return 0; } since it would leak the kernel address into the map. Unprivileged socket filter bpf programs have access to the following helper functions: - map lookup/update/delete (but they cannot store kernel pointers into them) - get_random (it's already exposed to unprivileged user space) - get_smp_processor_id - tail_call into another socket filter program - ktime_get_ns The feature is controlled by sysctl kernel.unprivileged_bpf_disabled. This toggle defaults to off (0), but can be set true (1). Once true, bpf programs and maps cannot be accessed from unprivileged process, and the toggle cannot be set back to false. Signed-off-by: Alexei Starovoitov Reviewed-by: Kees Cook Signed-off-by: David S. 
Miller --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5f4cf1c..0b00094 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1640,7 +1640,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); default: return NULL; } -- cgit v1.1 From f28ea365cdefc3b4fd0373e70b0106a0cd9b4c23 Mon Sep 17 00:00:00 2001 From: Edward Jee Date: Thu, 8 Oct 2015 14:56:48 -0700 Subject: sock: support per-packet fwmark It's useful to allow users to set fwmark for an individual packet, without changing the socket state. The function this patch adds in sock layer can be used by the protocols that need such a feature. Signed-off-by: Edward Hyunkoo Jee Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/core/sock.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 7dd1263..3395777 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1852,6 +1852,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) -- cgit v1.1 From c7d39e32632e5db9dc4da51198b76d8c315946ff Mon Sep 17 00:00:00 2001 From: Edward Jee Date: Thu, 8 Oct 2015 14:56:49 -0700 Subject: packet: support per-packet fwmark for af_packet sendmsg Signed-off-by: Edward Hyunkoo Jee Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 104910f..20c44e2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2630,6 +2630,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) __be16 proto; unsigned char *addr; int err, reserve = 0; + struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; int vnet_hdr_len; @@ -2665,6 +2666,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.mark = sk->sk_mark; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto out_unlock; + } + if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { @@ -2774,7 +2782,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc.mark; packet_pick_tx_queue(dev, skb); -- cgit v1.1 From 70da268b569d32a9fddeea85dc18043de9d89f89 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2015 19:33:21 -0700 Subject: net: SO_INCOMING_CPU setsockopt() support SO_INCOMING_CPU as added in commit 2c8c56e15df3 was a getsockopt() command to fetch incoming cpu handling a particular TCP flow after accept(). This commit adds setsockopt() support and extends SO_REUSEPORT selection logic: If a TCP listener or UDP socket has this option set, a packet is delivered to this socket only if CPU handling the packet matches the specified one. This allows building very efficient TCP servers, using one listener per RX queue, as the associated TCP listener should only accept flows handled in softirq by the same cpu. This provides optimal NUMA behavior and keeps cpu caches hot. 
Note that __inet_lookup_listener() still has to iterate over the list of all listeners. Following patch puts sk_refcnt in a different cache line to let this iteration hit only shared and read mostly cache lines. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 5 +++++ net/ipv4/inet_hashtables.c | 2 ++ net/ipv4/udp.c | 6 +++++- net/ipv6/inet6_hashtables.c | 2 ++ net/ipv6/udp.c | 11 +++++++---- 5 files changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 3395777..dcc7d62 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -988,6 +988,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -2379,6 +2383,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index bed8886..08643a3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -185,6 +185,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e1fc129..24ec14f9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; return score; } @@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 
6ac8dad..21ace5a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0aba654..01bcb49 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } -#define SCORE2_MAX (1 + 1 + 1) static inline int compute_score2(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, @@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score++; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -251,8 +256,7 @@ begin: hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); matches = 1; - } else if (score == SCORE2_MAX) - goto exact_match; + } } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -269,7 +273,6 @@ begin: goto begin; if (result) { -exact_match: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, -- cgit v1.1 From ed53d0ab761f5c71d77c8dc05fd19c0a851200db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2015 19:33:23 -0700 Subject: net: shrink struct sock and request_sock by 8 bytes One 32bit hole is following skc_refcnt, use it. skc_incoming_cpu can also be an union for request_sock rcv_wnd. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/syncookies.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 18 +++++++++--------- net/ipv4/tcp_output.c | 2 +- net/ipv6/syncookies.c | 4 ++-- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2dbb113..4c0892b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -382,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ddadb31..3b35c3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6022,7 +6022,7 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3431074..ddb1983 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1079e6a..41828bd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req, window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > full_space || req->window_clamp == 0)) - req->window_clamp = full_space; + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -512,9 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". 
*/ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 55ed3266..6e79fcb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f610b53..bb8f2fa 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -235,9 +235,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out_free; } - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 33334f0..2887c84 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -931,7 +931,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); -- cgit v1.1 From 0944d6b5a2fad9ba3b7abb2e94a6b7d40cd4a935 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 9 Oct 2015 13:54:11 +0200 Subject: bridge: try switchdev op first in __vlan_vid_add/del Some drivers need to implement both switchdev vlan ops and vid_add/kill ndos. For that to work in bridge code, we need to try switchdev op first when adding/deleting vlan id. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 58 ++++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 7a95e31..ad7e4f6 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -72,28 +72,20 @@ static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, u16 vid, u16 flags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct switchdev_obj_port_vlan v = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; int err; - /* If driver uses VLAN ndo ops, use 8021q to install vid - * on device, otherwise try switchdev ops to install vid. + /* Try switchdev op first. In case it is not supported, fallback to + * 8021q add. 
*/ - - if (ops->ndo_vlan_rx_add_vid) { - err = vlan_vid_add(dev, br->vlan_proto, vid); - } else { - struct switchdev_obj_port_vlan v = { - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .flags = flags, - .vid_begin = vid, - .vid_end = vid, - }; - - err = switchdev_port_obj_add(dev, &v.obj); - if (err == -EOPNOTSUPP) - err = 0; - } - + err = switchdev_port_obj_add(dev, &v.obj); + if (err == -EOPNOTSUPP) + return vlan_vid_add(dev, br->vlan_proto, vid); return err; } @@ -122,27 +114,21 @@ static void __vlan_del_list(struct net_bridge_vlan *v) static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, u16 vid) { - const struct net_device_ops *ops = dev->netdev_ops; - int err = 0; + struct switchdev_obj_port_vlan v = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .vid_begin = vid, + .vid_end = vid, + }; + int err; - /* If driver uses VLAN ndo ops, use 8021q to delete vid - * on device, otherwise try switchdev ops to delete vid. + /* Try switchdev op first. In case it is not supported, fallback to + * 8021q del. */ - - if (ops->ndo_vlan_rx_kill_vid) { + err = switchdev_port_obj_del(dev, &v.obj); + if (err == -EOPNOTSUPP) { vlan_vid_del(dev, br->vlan_proto, vid); - } else { - struct switchdev_obj_port_vlan v = { - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, - }; - - err = switchdev_port_obj_del(dev, &v.obj); - if (err == -EOPNOTSUPP) - err = 0; + return 0; } - return err; } -- cgit v1.1 From e2ca690b657f4ca5c204fcc6470d462b776d73b3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Oct 2015 14:34:31 +0200 Subject: ipv4/icmp: redirect messages can use the ingress daddr as source This patch allows configuring how the source address of ICMP redirect messages is selected; by default the old behaviour is retained, while setting icmp_redirects_use_orig_daddr force the usage of the destination address of the packet that caused the redirect. 
The new behaviour closely follows RFC 5798 section 8.1.1, and fixes the following scenario: Two machines are set up with VRRP to act as routers out of a subnet, they have IPs x.x.x.1/24 and x.x.x.2/24, with VRRP holding on to x.x.x.254/24. If a host in said subnet needs to get an ICMP redirect from the VRRP router, i.e. to reach a destination behind a different gateway, the source IP in the ICMP redirect is chosen as the primary IP on the interface that the packet arrived at, i.e. x.x.x.1 or x.x.x.2. The host will then ignore said redirect, due to RFC 1122 section 3.2.2.2, and will continue to use the wrong next-hop. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 9 ++++++++- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e2697..f3c356b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -659,7 +659,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) */ saddr = iph->daddr; - if (!(rt->rt_flags & RTCF_LOCAL)) { + if (!((type == ICMP_REDIRECT) && + net->ipv4.sysctl_icmp_redirects_use_orig_daddr) && + !(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; rcu_read_lock(); @@ -1222,6 +1224,11 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + /* Control paramerer - use the daddr of originating packets as saddr + * in redirect messages? 
+ */ + net->ipv4.sysctl_icmp_redirects_use_orig_daddr = 0; + return 0; fail: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 894da3a..30a531c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -818,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "icmp_redirects_use_orig_daddr", + .data = &init_net.ipv4.sysctl_icmp_redirects_use_orig_daddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), -- cgit v1.1 From 161642e24fee40fba2c5bc2ceacc00d118a22d65 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Oct 2015 11:29:32 -0700 Subject: packet: fix match_fanout_group() Recent TCP listener patches exposed a prior af_packet bug : match_fanout_group() blindly assumes it is always safe to cast sk to a packet socket to compare fanout with af_packet_priv But SYNACK packets can be sent while attached to request_sock, which are smaller than a "struct sock". We can read non existent memory and crash. Fixes: c0de08d04215 ("af_packet: don't emit packet on orig fanout group") Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Eric Leblond Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 20c44e2..396b3f1 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1519,10 +1519,10 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { - if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) - return true; + if (sk->sk_family != PF_PACKET) + return false; - return false; + return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) -- cgit v1.1 From 37fcbab61b8ecf75cb5fd81e5809b71c270f9632 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 9 Oct 2015 13:44:53 -0500 Subject: ipv4: Only compute net once in ip_call_ra_chain ip_call_ra_chain is called early in the forwarding chain from ip_forward and ip_mr_input, which makes skb->dev the correct expression to get the input network device and dev_net(skb->dev) a correct expression for the network namespace the packet is being processed in. Compute the network namespace and store it in a variable to make the code clearer. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7cc9f7b..804b86f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; @@ -167,7 +168,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), dev_net(dev))) { + net_eq(sock_net(sk), net)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; -- cgit v1.1 From 19bcf9f203c82c2028f5a0881b1f0690e3207190 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 9 Oct 2015 13:44:54 -0500 Subject: ipv4: Pass struct net into ip_defrag and ip_check_defrag The function ip_defrag is called on both the input and the output paths of the networking stack. In particular conntrack when it is tracking outbound packets from the local machine calls ip_defrag. So add a struct net parameter and stop making ip_defrag guess which network namespace it needs to defragment packets in. Signed-off-by: "Eric W. Biederman" Acked-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/ipv4/ip_fragment.c | 7 +++---- net/ipv4/ip_input.c | 7 ++++--- net/ipv4/netfilter/nf_defrag_ipv4.c | 7 ++++--- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/openvswitch/conntrack.c | 2 +- net/packet/af_packet.c | 6 +++--- 6 files changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9772b78..5482745 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -654,11 +654,10 @@ out_fail: } /* Process an incoming IP datagram fragment. */ -int ip_defrag(struct sk_buff *skb, u32 user) +int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; int vif = l3mdev_master_ifindex_rcu(dev); - struct net *net = dev_net(dev); struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); @@ -683,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); -struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; @@ -712,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - if (ip_defrag(skb, user)) + if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 804b86f..b1209b6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -170,7 +170,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) sk->sk_bound_dev_if == dev->ifindex) && net_eq(sock_net(sk), net)) { if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { @@ -247,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb) /* * Reassemble IP fragments. 
*/ + struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, + net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index b246346..bf25f45 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,14 +22,15 @@ #endif #include -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, + u_int32_t user) { int err; skb_orphan(skb); local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) { @@ -85,7 +86,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv, enum ip_defrag_users user = nf_ct_defrag_user(state->hook, skb); - if (nf_ct_ipv4_gather_frags(skb, user)) + if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 37dd77a..07a791e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -694,7 +694,7 @@ static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, int err; local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index eb759e3..cb76076 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -304,7 +304,7 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, int err; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - err = ip_defrag(skb, user); + err = ip_defrag(net, skb, user); if (err) return err; diff 
--git a/net/packet/af_packet.c b/net/packet/af_packet.c index 396b3f1..691660b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1439,17 +1439,17 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); + struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; - if (!net_eq(dev_net(dev), read_pnet(&f->net)) || - !num) { + if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { - skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); + skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } -- cgit v1.1 From b72775977c39dcd380777ff5ea8041fdf67ee382 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 9 Oct 2015 13:44:55 -0500 Subject: ipv6: Pass struct net into nf_ct_frag6_gather The function nf_ct_frag6_gather is called on both the input and the output paths of the networking stack. In particular ipv6_defrag which calls nf_ct_frag6_gather is called from both the PRE_ROUTING chain on input and the LOCAL_OUT chain on output. The addition of a net parameter makes it explicit which network namespace the packets are being reassembled in, and removes the need for nf_ct_frag6_gather to guess. Signed-off-by: "Eric W. Biederman" Acked-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 +--- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 ++- net/openvswitch/conntrack.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 701cd2b..2fb86a9 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -563,12 +563,10 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) return 0; } -struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) +struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { struct sk_buff *clone; struct net_device *dev = skb->dev; - struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev) - : dev_net(skb->dev); struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index a99baf6..5173a89 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -63,7 +63,8 @@ static unsigned int ipv6_defrag(void *priv, return NF_ACCEPT; #endif - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(state->hook, skb)); + reasm = nf_ct_frag6_gather(state->net, skb, + nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (reasm == NULL) return NF_STOLEN; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index cb76076..ad61426 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -315,7 +315,7 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, struct sk_buff *reasm; memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - reasm = nf_ct_frag6_gather(skb, user); + reasm = nf_ct_frag6_gather(net, skb, user); if (!reasm) return -EINPROGRESS; -- cgit v1.1 From 8c5b83f0f255542b40a1273c32eb067ec00bb2b2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 10 
Oct 2015 08:26:36 -0700 Subject: ipv6 route: use err pointers instead of returning pointer by reference This patch makes ip6_route_info_create return err pointer instead of returning the rt pointer by reference as suggested by Dave Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4320ddc..db5b54a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1724,21 +1724,21 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } -int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) +static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { - int err; struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; int addr_type; + int err = -EINVAL; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) - return -EINVAL; + goto out; #ifndef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len) - return -EINVAL; + goto out; #endif if (cfg->fc_ifindex) { err = -ENODEV; @@ -1958,9 +1958,7 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); - *rt_ret = rt; - - return 0; + return rt; out: if (dev) dev_put(dev); @@ -1969,20 +1967,21 @@ out: if (rt) dst_free(&rt->dst); - *rt_ret = NULL; - - return err; + return ERR_PTR(err); } int ip6_route_add(struct fib6_config *cfg) { struct mx6_config mxc = { .mx = NULL, }; - struct rt6_info *rt = NULL; + struct rt6_info *rt; int err; - err = ip6_route_info_create(cfg, &rt); - if (err) + rt = ip6_route_info_create(cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto out; + } err = ip6_convert_metrics(&mxc, cfg); if (err) @@ -2871,9 +2870,12 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) r_cfg.fc_encap_type = nla_get_u16(nla); } - err = ip6_route_info_create(&r_cfg, &rt); - if (err) + 
rt = ip6_route_info_create(&r_cfg); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto cleanup; + } err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { -- cgit v1.1 From 6623c60dc28ee966cd85c6f12aa2fc3c952d0179 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 11 Oct 2015 12:49:56 +0200 Subject: bridge: vlan: enforce no pvid flag in vlan ranges Currently it's possible for someone to send a vlan range to the kernel with the pvid flag set, which will result in the pvid bouncing from one vlan to another and isn't correct; it also introduces problems for hardware where it doesn't make sense to have more than 1 pvid. iproute2 already enforces this, so let's enforce it on kernel-side as well. Reported-by: Elad Raz Signed-off-by: Nikolay Aleksandrov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 544ab96..d792d1a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -524,6 +524,9 @@ static int br_afspec(struct net_bridge *br, if (vinfo_start) return -EINVAL; vinfo_start = vinfo; + /* don't allow range of pvids */ + if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID) + return -EINVAL; continue; } -- cgit v1.1 From 486798001b92eacbf9f809787a6348750c174035 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sun, 11 Oct 2015 16:46:03 -0400 Subject: RDS: Invoke ->laddr_check() in rds_bind() for explicitly bound transports. The IP address passed to rds_bind() should be vetted by the transport's ->laddr_check() for a previously bound transport. This needs to be done to avoid cases where, for example, the application has asked for an IB transport, but the IP address passed to bind is only usable on ethernet interfaces. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/bind.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index bc6b93e..6192566 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -196,7 +196,14 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; if (rs->rs_transport) { /* previously bound */ - ret = 0; + trans = rs->rs_transport; + if (trans->laddr_check(sock_net(sock->sk), + sin->sin_addr.s_addr) != 0) { + ret = -ENOPROTOOPT; + rds_remove_bound(rs); + } else { + ret = 0; + } goto out; } trans = rds_trans_get_preferred(sock_net(sock->sk), -- cgit v1.1 From 241b271952eb319622a6cc740a72abe41fc27ad6 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sun, 11 Oct 2015 16:49:44 -0400 Subject: RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one() Consider the following "duelling syn" sequence between two peers A and B: A B SYN1 --> <-- SYN2 SYN2ACK --> Note that the SYN/ACK has already been sent out by TCP before rds_tcp_accept_one() gets invoked as part of callbacks. If the inet_addr(A) is numerically less than inet_addr(B), the arbitration scheme in rds_tcp_accept_one() will prefer the TCP connection triggered by SYN1, and will send a CLOSE for the SYN2 (just after the SYN2ACK was sent). Since B also follows the same arbitration scheme, it will send the SYN-ACK for SYN1 that will set up a healthy ESTABLISHED connection on both sides. B will also get a CLOSE for SYN2, which should result in the cleanup of the TCP state machine for SYN2, but it should not trigger any stale RDS-TCP callbacks (such as ->writespace, ->state_change etc), that would disrupt the progress of the SYN2 based RDS-TCP connection. 
Thus the arbitration scheme in rds_tcp_accept_one() should restore rds_tcp callbacks for the winner before setting them up for the new accept socket, and also make sure that conn->c_outgoing is set to 0 so that we do not trigger any reconnect attempts on the passive side of the tcp socket in the future, in conformance with commit c82ac7e69efe ("net/rds: RDS-TCP: only initiate reconnect attempt on outgoing TCP socket.") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp_listen.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 1d90240..0936a4a 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -125,6 +125,9 @@ int rds_tcp_accept_one(struct socket *sock) new_sock = NULL; ret = 0; goto out; + } else if (rs_tcp->t_sock) { + rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + conn->c_outgoing = 0; } rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); -- cgit v1.1 From efd29b3d8266761570fd3f440e2d5aa24c678725 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sun, 11 Oct 2015 18:08:37 -0400 Subject: net: dsa: do not warn unsupported bridge ops A DSA driver may not provide the port_join_bridge and port_leave_bridge functions, so don't warn in such case. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bb2bd3b..43d7342 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1276,7 +1276,7 @@ int dsa_slave_netdevice_event(struct notifier_block *unused, goto out; err = dsa_slave_master_changed(dev); - if (err) + if (err && err != -EOPNOTSUPP) netdev_warn(dev, "failed to reflect master change\n"); break; -- cgit v1.1 From cc02aa8e41c50f690d0bb22ed5629468483421b7 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 12 Oct 2015 14:01:39 +0200 Subject: switchdev: enforce no pvid flag in vlan ranges We shouldn't allow the BRIDGE_VLAN_INFO_PVID flag in VLAN ranges. Signed-off-by: Nikolay Aleksandrov Acked-by: Elad Raz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 7a9ab90..b8aaf820 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -727,6 +727,9 @@ static int switchdev_port_br_afspec(struct net_device *dev, if (vlan.vid_begin) return -EINVAL; vlan.vid_begin = vinfo->vid; + /* don't allow range of pvids */ + if (vlan.flags & BRIDGE_VLAN_INFO_PVID) + return -EINVAL; } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { if (!vlan.vid_begin) return -EINVAL; -- cgit v1.1 From af3793921d49a772ec1079449219bad4baa0bc96 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 12 Oct 2015 17:55:55 +0200 Subject: bridge: fix gc_timer mod/del race condition Commit c62987bbd8a1 ("bridge: push bridge setting ageing_time down to switchdev") introduced a timer race condition because the gc_timer can get rearmed after it's supposedly stopped and flushed in br_dev_delete(), leading to a use of freed memory. So take rtnl to sync with bridge destruction when setting ageing_time. 
Here's the trace reproduced with these two commands running in parallel: while :; do echo 10000 > /sys/class/net/br0/bridge/ageing_timer; done; while :; do brctl addbr br0; ip l set br0 up; ip l set br0 down; brctl delbr br0; done; [ 300.000029] BUG: unable to handle kernel paging request at ffffffff811c59d3 [ 300.000263] IP: [] __internal_add_timer+0x2e/0xd0 [ 300.000422] PGD 1a0f067 PUD 1a10063 PMD 10001e1 [ 300.000639] Oops: 0003 [#1] SMP [ 300.000793] Modules linked in: bridge stp llc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ppdev aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd snd_hda_codec_generic qxl drm_kms_helper psmouse pcspkr ttm snd_hda_intel 9pnet_virtio evdev serio_raw joydev snd_hda_codec 9pnet virtio_balloon drm snd_hwdep virtio_console snd_hda_core pvpanic snd_pcm i2c_piix4 snd_timer acpi_cpufreq parport_pc snd parport soundcore button processor i2c_core ipv6 autofs4 hid_generic usbhid hid ext4 crc16 mbcache jbd2 sg sr_mod cdrom ata_generic virtio_blk virtio_net e1000 ehci_pci uhci_hcd ehci_hcd usbcore usb_common floppy ata_piix libata virtio_pci virtio_ring virtio scsi_mod [ 300.004008] CPU: 1 PID: 1169 Comm: bash Not tainted 4.3.0-rc3+ #46 [ 300.004008] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 300.004008] task: ffff880035be2200 ti: ffff88003795c000 task.ti: ffff88003795c000 [ 300.004008] RIP: 0010:[] [] __internal_add_timer+0x2e/0xd0 [ 300.004008] RSP: 0018:ffff88003fd03e78 EFLAGS: 00010046 [ 300.004008] RAX: ffff88003fd0ef60 RBX: 840fc78949c08548 RCX: 00000001ffffffff [ 300.004008] RDX: 0000000000000000 RSI: ffffffff811c59d3 RDI: ffff88003fd0df00 [ 300.004008] RBP: ffff88003fd03e78 R08: 00000000ffffffff R09: 0000000000000000 [ 300.004008] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003fd0df00 [ 300.004008] R13: 0000000000000000 R14: 0000000000000001 R15: ffffffff816032e0 [ 300.004008] FS: 00007fcbdd609700(0000) 
GS:ffff88003fd00000(0000) knlGS:0000000000000000 [ 300.004008] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 300.004008] CR2: ffffffff811c59d3 CR3: 0000000037879000 CR4: 00000000000406e0 [ 300.004008] Stack: [ 300.004008] ffff88003fd03ea8 ffffffff810f1775 ffff88003c8cb958 ffff88003fd0df00 [ 300.004008] 0000000000000000 0000000000000001 ffff88003fd03f18 ffffffff810f28c4 [ 300.004008] ffff88003fd0eb68 ffff88003fd0e968 ffff88003fd0e768 ffff88003fd0df68 [ 300.004008] Call Trace: [ 300.004008] [ 300.004008] [] cascade+0x45/0x70 [ 300.004008] [] run_timer_softirq+0x2f4/0x340 [ 300.004008] [] __do_softirq+0xd0/0x440 [ 300.004008] [] irq_exit+0xb3/0xc0 [ 300.004008] [] smp_apic_timer_interrupt+0x42/0x50 [ 300.004008] [] apic_timer_interrupt+0x87/0x90 [ 300.004008] [ 300.004008] [] ? create_object+0x13c/0x2e0 [ 300.004008] [] ? __kernel_text_address+0x4e/0x70 [ 300.004008] [] ? __kernel_text_address+0x4e/0x70 [ 300.004008] [] print_context_stack+0x7f/0xf0 [ 300.004008] [] dump_trace+0x11b/0x300 [ 300.004008] [] save_stack_trace+0x2b/0x50 [ 300.004008] [] create_object+0x13c/0x2e0 [ 300.004008] [] kmemleak_alloc+0x4e/0xb0 [ 300.004008] [] kmem_cache_alloc_trace+0x18d/0x2f0 [ 300.004008] [] kernfs_fop_open+0xc9/0x380 [ 300.004008] [] do_dentry_open+0x1ff/0x2f0 [ 300.004008] [] ? kernfs_fop_release+0x70/0x70 [ 300.004008] [] vfs_open+0x59/0x60 [ 300.004008] [] path_openat+0x1ce/0x1260 [ 300.004008] [] do_filp_open+0x7e/0xe0 [ 300.004008] [] ? 
__alloc_fd+0xaf/0x180 [ 300.004008] [] do_sys_open+0x12b/0x210 [ 300.004008] [] SyS_open+0x1e/0x20 [ 300.004008] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 300.004008] Code: 66 90 48 8b 46 10 48 8b 4f 40 55 48 89 c2 48 89 e5 48 29 ca 48 81 fa ff 00 00 00 77 20 0f b6 c0 48 8d 44 c7 68 48 8b 10 48 85 d2 <48> 89 16 74 04 48 89 72 08 48 89 30 48 89 46 08 5d c3 48 81 fa [ 300.004008] RIP [] __internal_add_timer+0x2e/0xd0 [ 300.004008] RSP [ 300.004008] CR2: ffffffff811c59d3 Fixes: c62987bbd8a1 ("bridge: push bridge setting ageing_time down to switchdev") Signed-off-by: Nikolay Aleksandrov Reviewed-by: Jiri Pirko Acked-by: Scott Feldman Signed-off-by: David S. Miller --- net/bridge/br_sysfs_br.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 04ef192..8365bd5 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -102,7 +102,15 @@ static ssize_t ageing_time_show(struct device *d, static int set_ageing_time(struct net_bridge *br, unsigned long val) { - return br_set_ageing_time(br, val); + int ret; + + if (!rtnl_trylock()) + return restart_syscall(); + + ret = br_set_ageing_time(br, val); + rtnl_unlock(); + + return ret; } static ssize_t ageing_time_store(struct device *d, -- cgit v1.1 From c4850687783717fa854554965c4bc85625d0e4a8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 12 Oct 2015 11:47:08 -0700 Subject: net: Export fib6_get_table and nd_tbl Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 1 + net/ipv6/ndisc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7d2e002..09fddf7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -264,6 +264,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) return NULL; } +EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b18012f..9f8a824 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -147,6 +147,7 @@ struct neigh_table nd_tbl = { .gc_thresh2 = 512, .gc_thresh3 = 1024, }; +EXPORT_SYMBOL_GPL(nd_tbl); static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) { -- cgit v1.1 From ca254490c8dfdaddb5df8a763774db0f4c5200c3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 12 Oct 2015 11:47:10 -0700 Subject: net: Add VRF support to IPv6 stack As with IPv4, support for VRFs is added to the IPv6 stack by replacing hardcoded table ids with possibly device-specific ones and manipulating the oif in the flowi6. The flow flags are used to skip the oif comparison in nexthop lookups if the device is enslaved to a VRF via the L3 master device. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 12 +++++++++--- net/ipv6/icmp.c | 6 +++++- net/ipv6/ip6_output.c | 6 ++++-- net/ipv6/ndisc.c | 26 +++++++++++++++++++++++--- net/ipv6/route.c | 27 ++++++++++++++++++++++----- 5 files changed, 63 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c8380f1..f0326aa 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -2146,7 +2147,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, unsigned long expires, u32 flags) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_PREFIX, + .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX, .fc_metric = IP6_RT_PRIO_ADDRCONF, .fc_ifindex = dev->ifindex, .fc_expires = expires, @@ -2179,8 +2180,9 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, struct fib6_node *fn; struct rt6_info *rt = NULL; struct fib6_table *table; + u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; - table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX); + table = fib6_get_table(dev_net(dev), tb_id); if (!table) return NULL; @@ -2211,7 +2213,7 @@ out: static void addrconf_add_mroute(struct net_device *dev) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_LOCAL, + .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_LOCAL, .fc_metric = IP6_RT_PRIO_ADDRCONF, .fc_ifindex = dev->ifindex, .fc_dst_len = 8, @@ -3029,6 +3031,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { struct in6_addr addr; + /* no link local addresses on L3 master devices */ + if (netif_is_l3_master(idev->dev)) + return; + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6c2b213..efb1c00 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -68,6 +68,7 @@ #include #include #include +#include #include @@ -496,6 +497,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev); + dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; @@ -575,7 +579,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = skb->dev->ifindex; + fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 32583b5..23f97c4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -55,6 +55,7 @@ #include #include #include +#include static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -885,7 +886,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { dst_release(dst); dst = NULL; } @@ -1037,7 +1039,7 @@ struct dst_entry *ip6_dst_lookup_flow(const 
struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; if (!fl6->flowi6_oif) - fl6->flowi6_oif = dst->dev->ifindex; + fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9f8a824..3e0f855 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -442,8 +443,11 @@ static void ndisc_send_skb(struct sk_buff *skb, if (!dst) { struct flowi6 fl6; + int oif = l3mdev_fib_oif(skb->dev); - icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex); + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); + if (oif != skb->dev->ifindex) + fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); @@ -767,7 +771,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { - +have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { /* @@ -793,6 +797,18 @@ static void ndisc_recv_ns(struct sk_buff *skb) } else { struct net *net = dev_net(dev); + /* perhaps an address on the master device */ + if (netif_is_l3_slave(dev)) { + struct net_device *mdev; + + mdev = netdev_master_upper_dev_get_rcu(dev); + if (mdev) { + ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1); + if (ifp) + goto have_ifp; + } + } + idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? 
*/ @@ -1484,6 +1500,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + int oif = l3mdev_fib_oif(dev); bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { @@ -1500,7 +1517,10 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, - &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); + &saddr_buf, &ipv6_hdr(skb)->saddr, oif); + + if (oif != skb->dev->ifindex) + fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index db5b54a..5fc1149 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -1044,6 +1045,9 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + oif = 0; + redo_rt6_select: rt = rt6_select(fn, oif, strict); if (rt->rt6i_nsiblings) @@ -1141,7 +1145,7 @@ void ip6_route_input(struct sk_buff *skb) int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { - .flowi6_iif = skb->dev->ifindex, + .flowi6_iif = l3mdev_fib_oif(skb->dev), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), @@ -1165,8 +1169,13 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { + struct dst_entry *dst; int flags = 0; + dst = l3mdev_rt6_dst_by_oif(net, fl6); + if (dst) + return dst; + fl6->flowi6_iif = LOOPBACK_IFINDEX; if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || @@ -2263,7 +2272,6 @@ static struct rt6_info *rt6_add_route_info(struct net *net, unsigned int pref) { struct fib6_config cfg = 
{ - .fc_table = RT6_TABLE_INFO, .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = ifindex, .fc_dst_len = prefixlen, @@ -2274,6 +2282,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_nlinfo.nl_net = net, }; + cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; @@ -2314,7 +2323,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, unsigned int pref) { struct fib6_config cfg = { - .fc_table = RT6_TABLE_DFLT, + .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | @@ -2361,7 +2370,8 @@ static void rtmsg_to_fib6_config(struct net *net, { memset(cfg, 0, sizeof(*cfg)); - cfg->fc_table = RT6_TABLE_MAIN; + cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? + : RT6_TABLE_MAIN; cfg->fc_ifindex = rtmsg->rtmsg_ifindex; cfg->fc_metric = rtmsg->rtmsg_metric; cfg->fc_expires = rtmsg->rtmsg_info; @@ -2470,6 +2480,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, bool anycast) { + u32 tb_id; struct net *net = dev_net(idev->dev); struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, DST_NOCOUNT); @@ -2492,7 +2503,8 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; - rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; + rt->rt6i_table = fib6_get_table(net, tb_id); rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); @@ -3254,6 +3266,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) } else { fl6.flowi6_oif = oif; + if (netif_index_is_l3_master(net, oif)) { + fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | + FLOWI_FLAG_SKIP_NH_OIF; + } + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } -- cgit v1.1 From 907b1e6e83ed25d9dece1e55b704581b6c127051 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 12 Oct 2015 21:47:02 +0200 Subject: bridge: vlan: use proper rcu for the vlgrp member The bridge and port's vlgrp member is already used in an RCU way; currently we rely on the fact that it cannot disappear while the port exists, but that is error-prone and we might miss places with improper locking (either RCU or RTNL must be held to walk the vlan_list). So make it official and use RCU for vlgrp to catch offenders. Introduce proper vlgrp accessors and use them consistently throughout the code. Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/bridge/br_device.c | 2 +- net/bridge/br_forward.c | 6 +-- net/bridge/br_input.c | 4 +- net/bridge/br_netlink.c | 4 +- net/bridge/br_private.h | 34 +++++++++++++-- net/bridge/br_vlan.c | 107 +++++++++++++++++++++++++++++------------------- 6 files changed, 104 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index bdfb954..5e88d3e 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -56,7 +56,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group(br), skb, &vid)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; if (is_broadcast_ether_addr(dest)) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6d5ed79..a9d424e 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -32,7 +32,7 @@ static inline int should_deliver(const struct net_bridge_port *p, { struct net_bridge_vlan_group *vg; - vg = nbp_vlan_group(p); + vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; } @@ -80,7 +80,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { struct net_bridge_vlan_group *vg; - vg = nbp_vlan_group(to); + vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; @@ -112,7 +112,7 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) return; } - vg = nbp_vlan_group(to); + vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f5c5a45..f7fba74 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -44,7 +44,7 @@ static int br_pass_frame_up(struct sk_buff *skb) brstats->rx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); - 
vg = br_vlan_group(br); + vg = br_vlan_group_rcu(br); /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc modue when someone * may be running packet capture. @@ -140,7 +140,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; - if (!br_allowed_ingress(p->br, nbp_vlan_group(p), skb, &vid)) + if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) goto out; /* insert into forwarding database after filtering to avoid spoofing */ diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d792d1a..2ee8fd6 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -102,10 +102,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, rcu_read_lock(); if (br_port_exists(dev)) { p = br_port_get_rcu(dev); - vg = nbp_vlan_group(p); + vg = nbp_vlan_group_rcu(p); } else if (dev->priv_flags & IFF_EBRIDGE) { br = netdev_priv(dev); - vg = br_vlan_group(br); + vg = br_vlan_group_rcu(br); } num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ba0c67b..8835642 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -132,6 +132,7 @@ struct net_bridge_vlan_group { struct list_head vlan_list; u16 num_vlans; u16 pvid; + struct rcu_head rcu; }; struct net_bridge_fdb_entry @@ -229,7 +230,7 @@ struct net_bridge_port struct netpoll *np; #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct net_bridge_vlan_group *vlgrp; + struct net_bridge_vlan_group __rcu *vlgrp; #endif }; @@ -337,7 +338,7 @@ struct net_bridge struct kobject *ifobj; u32 auto_cnt; #ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct net_bridge_vlan_group *vlgrp; + struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; __be16 vlan_proto; u16 default_pvid; @@ -700,13 +701,25 @@ int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); static 
inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { - return br->vlgrp; + return rtnl_dereference(br->vlgrp); } static inline struct net_bridge_vlan_group *nbp_vlan_group( const struct net_bridge_port *p) { - return p->vlgrp; + return rtnl_dereference(p->vlgrp); +} + +static inline struct net_bridge_vlan_group *br_vlan_group_rcu( + const struct net_bridge *br) +{ + return rcu_dereference(br->vlgrp); +} + +static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( + const struct net_bridge_port *p) +{ + return rcu_dereference(p->vlgrp); } /* Since bridge now depends on 8021Q module, but the time bridge sees the @@ -853,6 +866,19 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group( { return NULL; } + +static inline struct net_bridge_vlan_group *br_vlan_group_rcu( + const struct net_bridge *br) +{ + return NULL; +} + +static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( + const struct net_bridge_port *p) +{ + return NULL; +} + #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index ad7e4f6..ffaa6d9 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -54,9 +54,9 @@ static void __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) struct net_bridge_vlan_group *vg; if (br_vlan_is_master(v)) - vg = v->br->vlgrp; + vg = br_vlan_group(v->br); else - vg = v->port->vlgrp; + vg = nbp_vlan_group(v->port); if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(vg, v->vid); @@ -91,11 +91,16 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, static void __vlan_add_list(struct net_bridge_vlan *v) { + struct net_bridge_vlan_group *vg; struct list_head *headp, *hpos; struct net_bridge_vlan *vent; - headp = br_vlan_is_master(v) ? 
&v->br->vlgrp->vlan_list : - &v->port->vlgrp->vlan_list; + if (br_vlan_is_master(v)) + vg = br_vlan_group(v->br); + else + vg = nbp_vlan_group(v->port); + + headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); if (v->vid < vent->vid) @@ -137,14 +142,16 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, */ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid) { + struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; - masterv = br_vlan_find(br->vlgrp, vid); + vg = br_vlan_group(br); + masterv = br_vlan_find(vg, vid); if (!masterv) { /* missing global ctx, create it now */ if (br_vlan_add(br, vid, 0)) return NULL; - masterv = br_vlan_find(br->vlgrp, vid); + masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; } @@ -155,11 +162,14 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid static void br_vlan_put_master(struct net_bridge_vlan *masterv) { + struct net_bridge_vlan_group *vg; + if (!br_vlan_is_master(masterv)) return; + vg = br_vlan_group(masterv->br); if (atomic_dec_and_test(&masterv->refcnt)) { - rhashtable_remove_fast(&masterv->br->vlgrp->vlan_hash, + rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); kfree_rcu(masterv, rcu); @@ -189,12 +199,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (br_vlan_is_master(v)) { br = v->br; dev = br->dev; - vg = br->vlgrp; + vg = br_vlan_group(br); } else { p = v->port; br = p->br; dev = p->dev; - vg = p->vlgrp; + vg = nbp_vlan_group(p); } if (p) { @@ -266,10 +276,10 @@ static int __vlan_del(struct net_bridge_vlan *v) int err = 0; if (br_vlan_is_master(v)) { - vg = v->br->vlgrp; + vg = br_vlan_group(v->br); } else { p = v->port; - vg = v->port->vlgrp; + vg = nbp_vlan_group(v->port); masterv = v->brvlan; } @@ -305,7 +315,7 @@ static void __vlan_flush(struct net_bridge_vlan_group 
*vlgrp) list_for_each_entry_safe(vlan, tmp, &vlgrp->vlan_list, vlist) __vlan_del(vlan); rhashtable_destroy(&vlgrp->vlan_hash); - kfree(vlgrp); + kfree_rcu(vlgrp, rcu); } struct sk_buff *br_handle_vlan(struct net_bridge *br, @@ -467,7 +477,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) if (!br->vlan_enabled) return true; - vg = p->vlgrp; + vg = nbp_vlan_group(p); if (!vg || !vg->num_vlans) return false; @@ -493,12 +503,14 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) { + struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); - vlan = br_vlan_find(br->vlgrp, vid); + vg = br_vlan_group(br); + vlan = br_vlan_find(vg, vid); if (vlan) { if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ @@ -513,7 +525,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) } atomic_inc(&vlan->refcnt); vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; - br->vlgrp->num_vlans++; + vg->num_vlans++; } __vlan_add_flags(vlan, flags); return 0; @@ -541,11 +553,13 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) */ int br_vlan_delete(struct net_bridge *br, u16 vid) { + struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); - v = br_vlan_find(br->vlgrp, vid); + vg = br_vlan_group(br); + v = br_vlan_find(vg, vid); if (!v || !br_vlan_is_brentry(v)) return -ENOENT; @@ -626,6 +640,7 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; + struct net_bridge_vlan_group *vg; __be16 oldproto; if (br->vlan_proto == proto) @@ -633,7 +648,8 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) /* Add VLANs for the new proto to the device filter. 
*/ list_for_each_entry(p, &br->port_list, list) { - list_for_each_entry(vlan, &p->vlgrp->vlan_list, vlist) { + vg = nbp_vlan_group(p); + list_for_each_entry(vlan, &vg->vlan_list, vlist) { err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; @@ -647,19 +663,23 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) br_recalculate_fwd_mask(br); /* Delete VLANs for the old proto from the device filter. */ - list_for_each_entry(p, &br->port_list, list) - list_for_each_entry(vlan, &p->vlgrp->vlan_list, vlist) + list_for_each_entry(p, &br->port_list, list) { + vg = nbp_vlan_group(p); + list_for_each_entry(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, oldproto, vlan->vid); + } return 0; err_filt: - list_for_each_entry_continue_reverse(vlan, &p->vlgrp->vlan_list, vlist) + list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, proto, vlan->vid); - list_for_each_entry_continue_reverse(p, &br->port_list, list) - list_for_each_entry(vlan, &p->vlgrp->vlan_list, vlist) + list_for_each_entry_continue_reverse(p, &br->port_list, list) { + vg = nbp_vlan_group(p); + list_for_each_entry(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, proto, vlan->vid); + } return err; } @@ -703,11 +723,11 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) /* Disable default_pvid on all ports where it is still * configured. 
*/ - if (vlan_default_pvid(br->vlgrp, pvid)) + if (vlan_default_pvid(br_vlan_group(br), pvid)) br_vlan_delete(br, pvid); list_for_each_entry(p, &br->port_list, list) { - if (vlan_default_pvid(p->vlgrp, pvid)) + if (vlan_default_pvid(nbp_vlan_group(p), pvid)) nbp_vlan_delete(p, pvid); } @@ -717,6 +737,7 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) { const struct net_bridge_vlan *pvent; + struct net_bridge_vlan_group *vg; struct net_bridge_port *p; u16 old_pvid; int err = 0; @@ -737,8 +758,9 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) /* Update default_pvid config only if we do not conflict with * user configuration. */ - pvent = br_vlan_find(br->vlgrp, pvid); - if ((!old_pvid || vlan_default_pvid(br->vlgrp, old_pvid)) && + vg = br_vlan_group(br); + pvent = br_vlan_find(vg, pvid); + if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | @@ -754,9 +776,10 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) /* Update default_pvid config only if we do not conflict with * user configuration. 
*/ + vg = nbp_vlan_group(p); if ((old_pvid && - !vlan_default_pvid(p->vlgrp, old_pvid)) || - br_vlan_find(p->vlgrp, pvid)) + !vlan_default_pvid(vg, old_pvid)) || + br_vlan_find(vg, pvid)) continue; err = nbp_vlan_add(p, pvid, @@ -825,17 +848,19 @@ unlock: int br_vlan_init(struct net_bridge *br) { + struct net_bridge_vlan_group *vg; int ret = -ENOMEM; - br->vlgrp = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); - if (!br->vlgrp) + vg = kzalloc(sizeof(*vg), GFP_KERNEL); + if (!vg) goto out; - ret = rhashtable_init(&br->vlgrp->vlan_hash, &br_vlan_rht_params); + ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; - INIT_LIST_HEAD(&br->vlgrp->vlan_list); + INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; + rcu_assign_pointer(br->vlgrp, vg); ret = br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY); @@ -846,9 +871,9 @@ out: return ret; err_vlan_add: - rhashtable_destroy(&br->vlgrp->vlan_hash); + rhashtable_destroy(&vg->vlan_hash); err_rhtbl: - kfree(br->vlgrp); + kfree(vg); goto out; } @@ -866,9 +891,7 @@ int nbp_vlan_init(struct net_bridge_port *p) if (ret) goto err_rhtbl; INIT_LIST_HEAD(&vg->vlan_list); - /* Make sure everything's committed before publishing vg */ - smp_wmb(); - p->vlgrp = vg; + rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | @@ -897,7 +920,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) ASSERT_RTNL(); - vlan = br_vlan_find(port->vlgrp, vid); + vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { __vlan_add_flags(vlan, flags); return 0; @@ -925,7 +948,7 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) ASSERT_RTNL(); - v = br_vlan_find(port->vlgrp, vid); + v = br_vlan_find(nbp_vlan_group(port), vid); if (!v) return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); @@ -936,12 
+959,14 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) void nbp_vlan_flush(struct net_bridge_port *port) { + struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; ASSERT_RTNL(); - list_for_each_entry(vlan, &port->vlgrp->vlan_list, vlist) + vg = nbp_vlan_group(port); + list_for_each_entry(vlan, &vg->vlan_list, vlist) vlan_vid_del(port->dev, port->br->vlan_proto, vlan->vid); - __vlan_flush(nbp_vlan_group(port)); + __vlan_flush(vg); } -- cgit v1.1 From e9c953eff7f0ec69a52cfa87b912ab48902a0314 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 12 Oct 2015 21:47:03 +0200 Subject: bridge: vlan: use rcu for vlan_list traversal in br_fill_ifinfo br_fill_ifinfo is called by br_ifinfo_notify which can be called from many contexts with different locks held, sometimes it relies upon bridge's spinlock only which is a problem for the vlan code, so use explicitly rcu for that to avoid problems. Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ee8fd6..94b4de8 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -253,7 +253,7 @@ static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb, * if vlaninfo represents a range */ pvid = br_get_pvid(vg); - list_for_each_entry(v, &vg->vlan_list, vlist) { + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { flags = 0; if (!br_vlan_should_use(v)) continue; @@ -303,7 +303,7 @@ static int br_fill_ifvlaninfo(struct sk_buff *skb, u16 pvid; pvid = br_get_pvid(vg); - list_for_each_entry(v, &vg->vlan_list, vlist) { + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; @@ -386,22 +386,27 @@ static int br_fill_ifinfo(struct sk_buff *skb, struct nlattr *af; int err; + /* RCU needed because of the VLAN locking rules (rcu || rtnl) */ + rcu_read_lock(); if (port) - vg = nbp_vlan_group(port); + vg = nbp_vlan_group_rcu(port); else - vg = br_vlan_group(br); + vg = br_vlan_group_rcu(br); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans) { + rcu_read_unlock(); goto done; - + } af = nla_nest_start(skb, IFLA_AF_SPEC); - if (!af) + if (!af) { + rcu_read_unlock(); goto nla_put_failure; - + } if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) err = br_fill_ifvlaninfo_compressed(skb, vg); else err = br_fill_ifvlaninfo(skb, vg); + rcu_read_unlock(); if (err) goto nla_put_failure; nla_nest_end(skb, af); -- cgit v1.1 From b8d02c3cace37393bf9ff0a9eaa1ee39cda1d259 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 12 Oct 2015 21:47:04 +0200 Subject: bridge: vlan: drop unnecessary flush code As Ido Schimmel pointed out the vlan_vid_del() code in nbp_vlan_flush is unnecessary (and is actually a remnant of the old vlan code) so we can remove it. 
Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index ffaa6d9..85e6756 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -959,14 +959,7 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) void nbp_vlan_flush(struct net_bridge_port *port) { - struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *vlan; - ASSERT_RTNL(); - vg = nbp_vlan_group(port); - list_for_each_entry(vlan, &vg->vlan_list, vlist) - vlan_vid_del(port->dev, port->br->vlan_proto, vlan->vid); - - __vlan_flush(vg); + __vlan_flush(nbp_vlan_group(port)); } -- cgit v1.1 From f409d0ed87d2721e1099ce36266e98c5aea2d486 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 12 Oct 2015 21:47:05 +0200 Subject: bridge: vlan: move back vlan_flush Ido Schimmel reported a problem with switchdev devices because of the order change of del_nbp operations, more specifically the move of nbp_vlan_flush() which deletes all vlans and frees vlgrp after the rx_handler has been unregistered. So in order to fix this, move vlan_flush back where it was and make it destroy the rhtable after NULLing vlgrp and waiting a grace period to make sure no one can see it. Reported-by: Ido Schimmel Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/bridge/br_if.c | 3 +-- net/bridge/br_private.h | 1 - net/bridge/br_vlan.c | 31 ++++++++++++++++++++++++------- 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 934cae9..45e4757 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -248,6 +248,7 @@ static void del_nbp(struct net_bridge_port *p) list_del_rcu(&p->list); + nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); @@ -256,8 +257,6 @@ static void del_nbp(struct net_bridge_port *p) dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); - /* use the synchronize_rcu done by netdev_rx_handler_unregister */ - nbp_vlan_flush(p); br_multicast_del_port(p); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8835642..216018c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -132,7 +132,6 @@ struct net_bridge_vlan_group { struct list_head vlan_list; u16 num_vlans; u16 pvid; - struct rcu_head rcu; }; struct net_bridge_fdb_entry diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 85e6756..5f0d0cc 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -307,15 +307,20 @@ out: return err; } -static void __vlan_flush(struct net_bridge_vlan_group *vlgrp) +static void __vlan_group_free(struct net_bridge_vlan_group *vg) +{ + WARN_ON(!list_empty(&vg->vlan_list)); + rhashtable_destroy(&vg->vlan_hash); + kfree(vg); +} + +static void __vlan_flush(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; - __vlan_delete_pvid(vlgrp, vlgrp->pvid); - list_for_each_entry_safe(vlan, tmp, &vlgrp->vlan_list, vlist) + __vlan_delete_pvid(vg, vg->pvid); + list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) __vlan_del(vlan); - rhashtable_destroy(&vlgrp->vlan_hash); - kfree_rcu(vlgrp, rcu); } struct sk_buff *br_handle_vlan(struct net_bridge *br, @@ -571,9 +576,15 @@ int br_vlan_delete(struct net_bridge *br, u16 vid) void 
br_vlan_flush(struct net_bridge *br) { + struct net_bridge_vlan_group *vg; + ASSERT_RTNL(); - __vlan_flush(br_vlan_group(br)); + vg = br_vlan_group(br); + __vlan_flush(vg); + RCU_INIT_POINTER(br->vlgrp, NULL); + synchronize_rcu(); + __vlan_group_free(vg); } struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) @@ -959,7 +970,13 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) void nbp_vlan_flush(struct net_bridge_port *port) { + struct net_bridge_vlan_group *vg; + ASSERT_RTNL(); - __vlan_flush(nbp_vlan_group(port)); + vg = nbp_vlan_group(port); + __vlan_flush(vg); + RCU_INIT_POINTER(port->vlgrp, NULL); + synchronize_rcu(); + __vlan_group_free(vg); } -- cgit v1.1 From ba61a8d9d780980e8284355a0be750897e7af212 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Sep 2015 13:26:42 +0200 Subject: can: avoid using timeval for uapi The can subsystem communicates with user space using a bcm_msg_head header, which contains two timestamps. This is problematic for multiple reasons: a) The structure layout is currently incompatible between 64-bit user space and 32-bit user space, and cannot work in compat mode (other than x32). b) The timeval structure layout will change in 32-bit user space when we fix the y2038 overflow problem by redefining time_t to 64-bit, making new 32-bit user space incompatible with the current kernel interface. Cars last a long time and often use old kernels, so the actual users of this code are the most likely ones to migrate to y2038 safe user space. This tries to work around part of the problem by changing the publicly visible user interface in the header, but not the binary interface. Fortunately, the values passed around in the structure are relative times and do not actually suffer from the y2038 overflow, so 32-bit is enough here. 
We replace the use of 'struct timeval' with a newly defined 'struct bcm_timeval' that uses the exact same binary layout as before and that still suffers from problem a) but not problem b). The downside of this approach is that any user space program that currently assigns a timeval structure to these members rather than writing the tv_sec/tv_usec portions individually will suffer a compile-time error when built with an updated kernel header. Fixing this error makes it work fine with old and new headers though. We could address problem a) by using '__u32' or 'int' members rather than 'long', but that would have a more significant downside in also breaking support for all existing 64-bit user binaries that might be using this interface, which is likely not acceptable. Signed-off-by: Arnd Bergmann Acked-by: Oliver Hartkopp Cc: linux-can@vger.kernel.org Cc: linux-api@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index a1ba687..6863310 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -96,7 +96,7 @@ struct bcm_op { canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; - struct timeval ival1, ival2; + struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; @@ -131,6 +131,11 @@ static inline struct bcm_sock *bcm_sk(const struct sock *sk) return (struct bcm_sock *)sk; } +static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) +{ + return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); +} + #define CFSIZ sizeof(struct can_frame) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) @@ -953,8 +958,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; - op->kt_ival1 
= timeval_to_ktime(msg_head->ival1); - op->kt_ival2 = timeval_to_ktime(msg_head->ival2); + op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); + op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? */ if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64) @@ -1134,8 +1139,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; - op->kt_ival1 = timeval_to_ktime(msg_head->ival1); - op->kt_ival2 = timeval_to_ktime(msg_head->ival2); + op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); + op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? */ if (!op->kt_ival1.tv64) -- cgit v1.1 From 4bdc3d66147b3a623b32216a45431d0cff005f50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Oct 2015 17:12:54 -0700 Subject: tcp/dccp: fix behavior of stale SYN_RECV request sockets When a TCP/DCCP listener is closed, its pending SYN_RECV request sockets become stale, meaning 3WHS can not complete. But current behavior is wrong : incoming packets finding such stale sockets are dropped. We need instead to cleanup the request socket and perform another lookup : - Incoming ACK will give a RST answer, - SYN rtx might find another listener if available. - We expedite cleanup of request sockets and old listener socket. Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/dccp/ipv4.c | 15 +++++++-------- net/dccp/ipv6.c | 15 +++++++-------- net/ipv4/tcp_ipv4.c | 7 ++++++- net/ipv6/tcp_ipv6.c | 7 ++++++- 4 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8e99681..0dcf196 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -799,15 +799,10 @@ static int dccp_v4_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq); } - /* Step 2: - * Look up flow ID in table and get corresponding socket */ +lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, dh->dccph_sport, dh->dccph_dport); - /* - * Step 2: - * If no socket ... - */ - if (sk == NULL) { + if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; @@ -830,8 +825,12 @@ static int dccp_v4_rcv(struct sk_buff *skb) struct sock *nsk = NULL; sk = req->rsk_listener; - if (sk->sk_state == DCCP_LISTEN) + if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index aed314f..6883193 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -656,16 +656,11 @@ static int dccp_v6_rcv(struct sk_buff *skb) else DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - /* Step 2: - * Look up flow ID in table and get corresponding socket */ +lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); - /* - * Step 2: - * If no socket ... 
- */ - if (sk == NULL) { + if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; @@ -688,8 +683,12 @@ static int dccp_v6_rcv(struct sk_buff *skb) struct sock *nsk = NULL; sk = req->rsk_listener; - if (sk->sk_state == DCCP_LISTEN) + if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ddb1983..1ff0923 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1572,6 +1572,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; +lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1587,8 +1588,12 @@ process: sk = req->rsk_listener; if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; - if (sk->sk_state == TCP_LISTEN) + if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2887c84..7ce1c57 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1363,6 +1363,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); +lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, inet6_iif(skb)); if (!sk) @@ -1382,8 +1383,12 @@ process: reqsk_put(req); goto discard_it; } - if (sk->sk_state == TCP_LISTEN) + if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; -- cgit v1.1 From 02a6d6136fa2a17f400a030829a6435556b3e65b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 14 Oct 2015 14:25:53 
+0200 Subject: Revert "ipv4/icmp: redirect messages can use the ingress daddr as source" Revert the commit e2ca690b657f ("ipv4/icmp: redirect messages can use the ingress daddr as source"), which tried to introduce a more suitable behaviour for ICMP redirect messages generated by VRRP routers. However, RFC 5798 section 8.1.1 states: The IPv4 source address of an ICMP redirect should be the address that the end-host used when making its next-hop routing decision. while said commit used the generating packet destination address, which does not match the above and in most cases leads to no redirect packets being generated. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 9 +-------- net/ipv4/sysctl_net_ipv4.c | 7 ------- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f3c356b..36e2697 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -659,9 +659,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) */ saddr = iph->daddr; - if (!((type == ICMP_REDIRECT) && - net->ipv4.sysctl_icmp_redirects_use_orig_daddr) && - !(rt->rt_flags & RTCF_LOCAL)) { + if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; rcu_read_lock(); @@ -1224,11 +1222,6 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; - /* Control paramerer - use the daddr of originating packets as saddr - * in redirect messages? 
- */ - net->ipv4.sysctl_icmp_redirects_use_orig_daddr = 0; - return 0; fail: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 30a531c..894da3a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -818,13 +818,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .procname = "icmp_redirects_use_orig_daddr", - .data = &init_net.ipv4.sysctl_icmp_redirects_use_orig_daddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), -- cgit v1.1 From c2f34a65a61cd1ace3b53c93e8b38d2f79f4ff0d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 05:58:38 -0700 Subject: tcp/dccp: fix potential NULL deref in __inet_inherit_port() As we no longer hold listener lock in fast path, it is possible that a child is created right after listener freed its bound port, if a close() is done while incoming packets are processed. __inet_inherit_port() must detect this and return an error, so that caller can free the child earlier. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/inet_hashtables.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 08643a3..958728a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -137,6 +137,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (unlikely(!tb)) { + spin_unlock(&head->lock); + return -ENOENT; + } if (tb->port != port) { /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption -- cgit v1.1 From f985c65c908f6b26c30019a83dc5ea295f5fcf62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 06:16:49 -0700 Subject: tcp: avoid spurious SYN flood detection at listen() time At listen() time, there is a small window where listener is visible with a zero backlog, triggering a spurious "Possible SYN flooding on port" message. Nothing prevents us from setting the correct backlog. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 514b9e9..ba9ec9a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -727,14 +727,14 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); -int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = 0; + sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); -- cgit v1.1 From 793f40147e82cdedc80971fa7f5596d6ed1e555e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:48 +0200 Subject: switchdev: introduce switchdev deferred ops infrastructure Introduce infrastructure which will be used internally to defer ops. Note that the deferred ops are queued up and either are processed by scheduled work or explicitly by user calling deferred_process function. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/switchdev/switchdev.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index b8aaf820..5e64b59 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -92,6 +93,85 @@ static void switchdev_trans_items_warn_destroy(struct net_device *dev, switchdev_trans_items_destroy(trans); } +static LIST_HEAD(deferred); +static DEFINE_SPINLOCK(deferred_lock); + +typedef void switchdev_deferred_func_t(struct net_device *dev, + const void *data); + +struct switchdev_deferred_item { + struct list_head list; + struct net_device *dev; + switchdev_deferred_func_t *func; + unsigned long data[0]; +}; + +static struct switchdev_deferred_item *switchdev_deferred_dequeue(void) +{ + struct switchdev_deferred_item *dfitem; + + spin_lock_bh(&deferred_lock); + if (list_empty(&deferred)) { + dfitem = NULL; + goto unlock; + } + dfitem = list_first_entry(&deferred, + struct switchdev_deferred_item, list); + list_del(&dfitem->list); +unlock: + spin_unlock_bh(&deferred_lock); + return dfitem; +} + +/** + * switchdev_deferred_process - Process ops in deferred queue + * + * Called to flush the ops currently queued in deferred ops queue. + * rtnl_lock must be held. 
+ */ +void switchdev_deferred_process(void) +{ + struct switchdev_deferred_item *dfitem; + + ASSERT_RTNL(); + + while ((dfitem = switchdev_deferred_dequeue())) { + dfitem->func(dfitem->dev, dfitem->data); + dev_put(dfitem->dev); + kfree(dfitem); + } +} +EXPORT_SYMBOL_GPL(switchdev_deferred_process); + +static void switchdev_deferred_process_work(struct work_struct *work) +{ + rtnl_lock(); + switchdev_deferred_process(); + rtnl_unlock(); +} + +static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work); + +static int switchdev_deferred_enqueue(struct net_device *dev, + const void *data, size_t data_len, + switchdev_deferred_func_t *func) +{ + struct switchdev_deferred_item *dfitem; + + dfitem = kmalloc(sizeof(*dfitem) + data_len, GFP_ATOMIC); + if (!dfitem) + return -ENOMEM; + dfitem->dev = dev; + dfitem->func = func; + memcpy(dfitem->data, data, data_len); + dev_hold(dev); + spin_lock_bh(&deferred_lock); + list_add_tail(&dfitem->list, &deferred); + spin_unlock_bh(&deferred_lock); + schedule_work(&deferred_process_work); + return 0; +} + /** * switchdev_port_attr_get - Get port attribute * -- cgit v1.1 From f7fadf3047d005d17376da65aa9e5734f45a77d4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:49 +0200 Subject: switchdev: make struct switchdev_attr parameter const for attr_set calls Signed-off-by: Jiri Pirko Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 2 +- net/switchdev/switchdev.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 43d7342..84cd863 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -453,7 +453,7 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) } static int dsa_slave_port_attr_set(struct net_device *dev, - struct switchdev_attr *attr, + const struct switchdev_attr *attr, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 5e64b59..23b4e5b 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -215,7 +215,7 @@ int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) EXPORT_SYMBOL_GPL(switchdev_port_attr_get); static int __switchdev_port_attr_set(struct net_device *dev, - struct switchdev_attr *attr, + const struct switchdev_attr *attr, struct switchdev_trans *trans) { const struct switchdev_ops *ops = dev->switchdev_ops; @@ -274,7 +274,7 @@ static void switchdev_port_attr_set_work(struct work_struct *work) } static int switchdev_port_attr_set_defer(struct net_device *dev, - struct switchdev_attr *attr) + const struct switchdev_attr *attr) { struct switchdev_attr_set_work *asw; @@ -303,7 +303,8 @@ static int switchdev_port_attr_set_defer(struct net_device *dev, * system is not left in a partially updated state due to * failure from driver/device. 
*/ -int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) +int switchdev_port_attr_set(struct net_device *dev, + const struct switchdev_attr *attr) { struct switchdev_trans trans; int err; -- cgit v1.1 From 0bc05d585d381c30de3fdf955730df31593d2101 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:50 +0200 Subject: switchdev: allow caller to explicitly request attr_set as deferred Caller should know if he can call attr_set directly (when holding RTNL) or if he has to defer the attr_set processing for later. This also allows drivers to sleep inside attr_set and report operation status back to switchdev core. Switchdev core then warns if status is not ok, instead of silent errors happening in drivers. Benefit from newly introduced switchdev deferred ops infrastructure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 3 +- net/switchdev/switchdev.c | 108 ++++++++++++++++++---------------------------- 2 files changed, 45 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index db6d243de..80c34d7 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, + .flags = SWITCHDEV_F_DEFER, .u.stp_state = state, }; int err; p->state = state; err = switchdev_port_attr_set(p->dev, &attr); - if (err && err != -EOPNOTSUPP) + if (err) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 23b4e5b..007b8f4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -250,75 +250,12 @@ done: return err; } -struct switchdev_attr_set_work { - struct work_struct work; - struct net_device *dev; - struct switchdev_attr attr; -}; - -static void 
switchdev_port_attr_set_work(struct work_struct *work) -{ - struct switchdev_attr_set_work *asw = - container_of(work, struct switchdev_attr_set_work, work); - int err; - - rtnl_lock(); - err = switchdev_port_attr_set(asw->dev, &asw->attr); - if (err && err != -EOPNOTSUPP) - netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n", - err, asw->attr.id); - rtnl_unlock(); - - dev_put(asw->dev); - kfree(work); -} - -static int switchdev_port_attr_set_defer(struct net_device *dev, - const struct switchdev_attr *attr) -{ - struct switchdev_attr_set_work *asw; - - asw = kmalloc(sizeof(*asw), GFP_ATOMIC); - if (!asw) - return -ENOMEM; - - INIT_WORK(&asw->work, switchdev_port_attr_set_work); - - dev_hold(dev); - asw->dev = dev; - memcpy(&asw->attr, attr, sizeof(asw->attr)); - - schedule_work(&asw->work); - - return 0; -} - -/** - * switchdev_port_attr_set - Set port attribute - * - * @dev: port device - * @attr: attribute to set - * - * Use a 2-phase prepare-commit transaction model to ensure - * system is not left in a partially updated state due to - * failure from driver/device. - */ -int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) +static int switchdev_port_attr_set_now(struct net_device *dev, + const struct switchdev_attr *attr) { struct switchdev_trans trans; int err; - if (!rtnl_is_locked()) { - /* Running prepare-commit transaction across stacked - * devices requires nothing moves, so if rtnl_lock is - * not held, schedule a worker thread to hold rtnl_lock - * while setting attr. - */ - - return switchdev_port_attr_set_defer(dev, attr); - } - switchdev_trans_init(&trans); /* Phase I: prepare for attr set. 
Driver/device should fail @@ -355,6 +292,47 @@ int switchdev_port_attr_set(struct net_device *dev, return err; } + +static void switchdev_port_attr_set_deferred(struct net_device *dev, + const void *data) +{ + const struct switchdev_attr *attr = data; + int err; + + err = switchdev_port_attr_set_now(dev, attr); + if (err && err != -EOPNOTSUPP) + netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", + err, attr->id); +} + +static int switchdev_port_attr_set_defer(struct net_device *dev, + const struct switchdev_attr *attr) +{ + return switchdev_deferred_enqueue(dev, attr, sizeof(*attr), + switchdev_port_attr_set_deferred); +} + +/** + * switchdev_port_attr_set - Set port attribute + * + * @dev: port device + * @attr: attribute to set + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. + * + * rtnl_lock must be held and must not be in atomic section, + * in case SWITCHDEV_F_DEFER flag is not set. + */ +int switchdev_port_attr_set(struct net_device *dev, + const struct switchdev_attr *attr) +{ + if (attr->flags & SWITCHDEV_F_DEFER) + return switchdev_port_attr_set_defer(dev, attr); + ASSERT_RTNL(); + return switchdev_port_attr_set_now(dev, attr); +} EXPORT_SYMBOL_GPL(switchdev_port_attr_set); static int __switchdev_port_obj_add(struct net_device *dev, -- cgit v1.1 From 850d0cbc9171f63f0418afffb0d89a84db927851 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:51 +0200 Subject: switchdev: remove pointers from switchdev objects When object is used in deferred work, we cannot use pointers in switchdev object structures because the memory they point at may be already used by someone else. So rather do local copy of the value. Signed-off-by: Jiri Pirko Acked-by: Scott Feldman Reviewed-by: John Fastabend Signed-off-by: David S. 
Miller --- net/bridge/br_fdb.c | 2 +- net/dsa/slave.c | 2 +- net/switchdev/switchdev.c | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index f43ce05..f5e7da0 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -135,10 +135,10 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) { struct switchdev_obj_port_fdb fdb = { .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, - .addr = f->addr.addr, .vid = f->vlan_id, }; + ether_addr_copy(fdb.addr, f->addr.addr); switchdev_port_obj_del(f->dst->dev, &fdb.obj); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 84cd863..b0b8da0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -393,7 +393,7 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, if (ret < 0) break; - fdb->addr = addr; + ether_addr_copy(fdb->addr, addr); fdb->vid = vid; fdb->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 007b8f4..5963d7a 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -891,10 +892,10 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], { struct switchdev_obj_port_fdb fdb = { .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, - .addr = addr, .vid = vid, }; + ether_addr_copy(fdb.addr, addr); return switchdev_port_obj_add(dev, &fdb.obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); @@ -916,10 +917,10 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], { struct switchdev_obj_port_fdb fdb = { .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, - .addr = addr, .vid = vid, }; + ether_addr_copy(fdb.addr, addr); return switchdev_port_obj_del(dev, &fdb.obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); @@ -1081,7 +1082,6 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, .dst = dst, .dst_len 
= dst_len, - .fi = fi, .tos = tos, .type = type, .nlflags = nlflags, @@ -1090,6 +1090,8 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, struct net_device *dev; int err = 0; + memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi)); + /* Don't offload route if using custom ip rules or if * IPv4 FIB offloading has been disabled completely. */ @@ -1133,7 +1135,6 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, .dst = dst, .dst_len = dst_len, - .fi = fi, .tos = tos, .type = type, .nlflags = 0, @@ -1142,6 +1143,8 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, struct net_device *dev; int err = 0; + memcpy(&ipv4_fib.fi, fi, sizeof(ipv4_fib.fi)); + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) return 0; -- cgit v1.1 From 4d429c5ddc5128fccd3048059ae26bb39f0d8284 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:52 +0200 Subject: switchdev: introduce possibility to defer obj_add/del Similar to the attr usecase, the caller knows if he is holding RTNL and is in atomic section. So let the caller decide the correct call variant. This allows drivers to sleep inside their ops and wait for hw to get the operation status. Then the status is propagated into switchdev core. This avoids silent errors in drivers. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- net/switchdev/switchdev.c | 100 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 5963d7a..eac68c4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -362,21 +362,8 @@ static int __switchdev_port_obj_add(struct net_device *dev, return err; } -/** - * switchdev_port_obj_add - Add port object - * - * @dev: port device - * @id: object ID - * @obj: object to add - * - * Use a 2-phase prepare-commit transaction model to ensure - * system is not left in a partially updated state due to - * failure from driver/device. - * - * rtnl_lock must be held. - */ -int switchdev_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj) +static int switchdev_port_obj_add_now(struct net_device *dev, + const struct switchdev_obj *obj) { struct switchdev_trans trans; int err; @@ -418,18 +405,53 @@ int switchdev_port_obj_add(struct net_device *dev, return err; } -EXPORT_SYMBOL_GPL(switchdev_port_obj_add); + +static void switchdev_port_obj_add_deferred(struct net_device *dev, + const void *data) +{ + const struct switchdev_obj *obj = data; + int err; + + err = switchdev_port_obj_add_now(dev, obj); + if (err && err != -EOPNOTSUPP) + netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", + err, obj->id); +} + +static int switchdev_port_obj_add_defer(struct net_device *dev, + const struct switchdev_obj *obj) +{ + return switchdev_deferred_enqueue(dev, obj, sizeof(*obj), + switchdev_port_obj_add_deferred); +} /** - * switchdev_port_obj_del - Delete port object + * switchdev_port_obj_add - Add port object * * @dev: port device * @id: object ID - * @obj: object to delete + * @obj: object to add + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. 
+ * + * rtnl_lock must be held and must not be in atomic section, + * in case SWITCHDEV_F_DEFER flag is not set. */ -int switchdev_port_obj_del(struct net_device *dev, +int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj) { + if (obj->flags & SWITCHDEV_F_DEFER) + return switchdev_port_obj_add_defer(dev, obj); + ASSERT_RTNL(); + return switchdev_port_obj_add_now(dev, obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_add); + +static int switchdev_port_obj_del_now(struct net_device *dev, + const struct switchdev_obj *obj) +{ const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; struct list_head *iter; @@ -444,13 +466,51 @@ int switchdev_port_obj_del(struct net_device *dev, */ netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_obj_del(lower_dev, obj); + err = switchdev_port_obj_del_now(lower_dev, obj); if (err) break; } return err; } + +static void switchdev_port_obj_del_deferred(struct net_device *dev, + const void *data) +{ + const struct switchdev_obj *obj = data; + int err; + + err = switchdev_port_obj_del_now(dev, obj); + if (err && err != -EOPNOTSUPP) + netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", + err, obj->id); +} + +static int switchdev_port_obj_del_defer(struct net_device *dev, + const struct switchdev_obj *obj) +{ + return switchdev_deferred_enqueue(dev, obj, sizeof(*obj), + switchdev_port_obj_del_deferred); +} + +/** + * switchdev_port_obj_del - Delete port object + * + * @dev: port device + * @id: object ID + * @obj: object to delete + * + * rtnl_lock must be held and must not be in atomic section, + * in case SWITCHDEV_F_DEFER flag is not set. 
+ */ +int switchdev_port_obj_del(struct net_device *dev, + const struct switchdev_obj *obj) +{ + if (obj->flags & SWITCHDEV_F_DEFER) + return switchdev_port_obj_del_defer(dev, obj); + ASSERT_RTNL(); + return switchdev_port_obj_del_now(dev, obj); +} EXPORT_SYMBOL_GPL(switchdev_port_obj_del); /** -- cgit v1.1 From 56607386e80cc7ce923592e115a3492485b47c72 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:53 +0200 Subject: bridge: defer switchdev fdb del call in fdb_del_external_learn Since spinlock is held here, defer the switchdev operation. Also, ensure that deferred switchdev ops are processed before port master device is unlinked. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 5 ++++- net/bridge/br_if.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index f5e7da0..c88bd8e 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -134,7 +134,10 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) { struct switchdev_obj_port_fdb fdb = { - .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, + .obj = { + .id = SWITCHDEV_OBJ_ID_PORT_FDB, + .flags = SWITCHDEV_F_DEFER, + }, .vid = f->vlan_id, }; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 45e4757..ec02f586 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "br_private.h" @@ -250,6 +251,8 @@ static void del_nbp(struct net_bridge_port *p) nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); + switchdev_deferred_process(); + nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); -- cgit v1.1 From 771acac2ffa5957b91e881908cd4c9657978a209 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 14 Oct 2015 19:40:55 +0200 Subject: switchdev: assert rtnl mutex when going over lower netdevs netdev_for_each_lower_dev
has to be called with rtnl mutex held. So better enforce it in switchdev functions. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index eac68c4..73e3895 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -520,6 +520,8 @@ EXPORT_SYMBOL_GPL(switchdev_port_obj_del); * @id: object ID * @obj: object to dump * @cb: function to call with a filled object + * + * rtnl_lock must be held. */ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, switchdev_obj_dump_cb_t *cb) @@ -529,6 +531,8 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, struct list_head *iter; int err = -EOPNOTSUPP; + ASSERT_RTNL(); + if (ops && ops->switchdev_port_obj_dump) return ops->switchdev_port_obj_dump(dev, obj, cb); @@ -1097,6 +1101,8 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) struct net_device *dev = NULL; int nhsel; + ASSERT_RTNL(); + /* For this route, all nexthop devs must be on the same switch. */ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { @@ -1327,10 +1333,11 @@ void switchdev_port_fwd_mark_set(struct net_device *dev, u32 mark = dev->ifindex; u32 reset_mark = 0; - if (group_dev && joining) { - mark = switchdev_port_fwd_mark_get(dev, group_dev); - } else if (group_dev && !joining) { - if (dev->offload_fwd_mark == mark) + if (group_dev) { + ASSERT_RTNL(); + if (joining) + mark = switchdev_port_fwd_mark_get(dev, group_dev); + else if (dev->offload_fwd_mark == mark) /* Ohoh, this port was the mark reference port, * but it's leaving the group, so reset the * mark for the remaining ports in the group. 
-- cgit v1.1 From 9945e8043ef9273cfb633d930e2a5a9116009b09 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:40 -0400 Subject: tipc: limit usage of temporary skb list during packet reception During packet reception, the function tipc_link_rcv() adds its accepted packets to a temporary buffer queue, before finally splicing this queue into the lock protected input queue that will be delivered up to the socket layer. The purpose is to reduce potential contention on the input queue lock. However, since the vast majority of packets arrive in sequence, they will anyway be added one by one to the input queue, and the use of the temporary queue becomes a sub-optimization. The only case where this queue makes sense is when unpacking buffers from a bundle packet; here we want to avoid dozens of small buffers to be added individually to the lock-protected input queue in a tight loop. In this commit, we remove the general usage of the temporary queue, and keep it only for the packet unbundling case. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/link.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 75db07c..11f7429 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -953,7 +953,7 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb, case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: case CONN_MANAGER: - __skb_queue_tail(inputq, skb); + skb_queue_tail(inputq, skb); return true; case NAME_DISTRIBUTOR: node->bclink.recv_permitted = true; @@ -982,6 +982,7 @@ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); struct sk_buff **reasm_skb = &l->reasm_buf; struct sk_buff *iskb; + struct sk_buff_head tmpq; int usr = msg_user(hdr); int rc = 0; int pos = 0; @@ -1006,10 +1007,12 @@ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, } if (usr == MSG_BUNDLER) { + skb_queue_head_init(&tmpq); l->stats.recv_bundles++; l->stats.recv_bundled += msg_msgcnt(hdr); while (tipc_msg_extract(skb, &iskb, &pos)) - tipc_data_input(l, iskb, inputq); + tipc_data_input(l, iskb, &tmpq); + tipc_skb_queue_splice_tail(&tmpq, inputq); return 0; } else if (usr == MSG_FRAGMENTER) { l->stats.recv_fragments++; @@ -1053,13 +1056,10 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct sk_buff_head *arrvq = &l->deferdq; - struct sk_buff_head tmpq; struct tipc_msg *hdr; u16 seqno, rcv_nxt; int rc = 0; - __skb_queue_head_init(&tmpq); - if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) { if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV)) tipc_link_build_proto_msg(l, STATE_MSG, 0, @@ -1114,8 +1114,8 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Packet can be delivered */ l->rcv_nxt++; l->stats.recv_info++; - if (unlikely(!tipc_data_input(l, skb, &tmpq))) - rc = tipc_link_input(l, skb, &tmpq); + if (unlikely(!tipc_data_input(l, skb, l->inputq))) + rc = tipc_link_input(l, skb, 
l->inputq); /* Ack at regular intervals */ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { @@ -1126,7 +1126,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, } } exit: - tipc_skb_queue_splice_tail(&tmpq, l->inputq); return rc; } -- cgit v1.1 From f9aa358a8109f9f33e96c3a7efb9a07631670294 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:41 -0400 Subject: tipc: simplify tipc_link_rcv() reception loop Currently, all packets received in tipc_link_rcv() are unconditionally added to the packet deferred queue, whereafter that queue is walked and all its buffers evaluated for delivery. This is both non-optimal and makes the queue sorting function unnecessarily complex. This commit changes the loop so that an arrived packet is evaluated first, and added to the deferred queue only when a sequence number gap is discovered. A non-empty deferred queue is walked until it is empty or until its head's sequence number doesn't fit. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S.
Miller --- net/tipc/link.c | 84 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 11f7429..8e23ab5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1047,44 +1047,55 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) return released; } +/* tipc_link_build_ack_msg: prepare link acknowledge message for transmission + */ +void tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + l->rcv_unacked = 0; + l->stats.sent_acks++; + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); +} + +/* tipc_link_build_nack_msg: prepare link nack message for transmission + */ +static void tipc_link_build_nack_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + u32 def_cnt = ++l->stats.deferred_recv; + + if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); +} + /* tipc_link_rcv - process TIPC packets/messages arriving from off-node - * @link: the link that should handle the message + * @l: the link that should handle the message * @skb: TIPC packet * @xmitq: queue to place packets to be sent after this call */ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { - struct sk_buff_head *arrvq = &l->deferdq; + struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr; u16 seqno, rcv_nxt; int rc = 0; - if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) { - if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, - 0, 0, 0, xmitq); - return rc; - } - - while ((skb = skb_peek(arrvq))) { + do { hdr = buf_msg(skb); + seqno = msg_seqno(hdr); + rcv_nxt = l->rcv_nxt; /* Verify and update link state */ - if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) { - __skb_dequeue(arrvq); - rc = tipc_link_proto_rcv(l, skb, xmitq); - continue; - } + if 
(unlikely(msg_user(hdr) == LINK_PROTOCOL)) + return tipc_link_proto_rcv(l, skb, xmitq); if (unlikely(!link_is_up(l))) { rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); - if (!link_is_up(l)) { - kfree_skb(__skb_dequeue(arrvq)); - goto exit; - } + if (!link_is_up(l)) + goto drop; } + /* Don't send probe at next timeout expiration */ l->silent_intv_cnt = 0; /* Forward queues and wake up waiting users */ @@ -1095,37 +1106,36 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, } /* Defer reception if there is a gap in the sequence */ - seqno = msg_seqno(hdr); - rcv_nxt = l->rcv_nxt; if (unlikely(less(rcv_nxt, seqno))) { - l->stats.deferred_recv++; - goto exit; + __tipc_skb_queue_sorted(defq, skb); + tipc_link_build_nack_msg(l, xmitq); + break; } - __skb_dequeue(arrvq); - /* Drop if packet already received */ if (unlikely(more(rcv_nxt, seqno))) { l->stats.duplicates++; - kfree_skb(skb); - goto exit; + goto drop; } /* Packet can be delivered */ l->rcv_nxt++; l->stats.recv_info++; - if (unlikely(!tipc_data_input(l, skb, l->inputq))) + + if (!tipc_data_input(l, skb, l->inputq)) rc = tipc_link_input(l, skb, l->inputq); + if (rc) + break; /* Ack at regular intervals */ - if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { - l->rcv_unacked = 0; - l->stats.sent_acks++; - tipc_link_build_proto_msg(l, STATE_MSG, - 0, 0, 0, 0, xmitq); - } - } -exit: + if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) + tipc_link_build_ack_msg(l, xmitq); + + } while ((skb = __skb_dequeue(defq))); + + return rc; +drop: + kfree_skb(skb); return rc; } @@ -1249,7 +1259,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, } /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets - * with contents of the link's tranmsit and backlog queues. + * with contents of the link's transmit and backlog queues. 
 */ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq) -- cgit v1.1 From 81204c492b05274ade680c54787cd8ba234dcfd7 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:42 -0400 Subject: tipc: improve sequence number checking The sequence number of an incoming packet is currently only checked for less than, equality to, or bigger than the next expected number, meaning that the receive window in practice becomes one half sequence number cycle, or U16_MAX/2. This does not make sense, and may not even be safe if there are extreme delays in the network. Any packet sent by the peer during the ongoing cycle must belong inside his current send window, or should otherwise be dropped if possible. Since a link endpoint cannot know its peer's current send window, it has to base this sanity check on a worst-case assumption, i.e., that the peer is using a maximum sized window of 8191 packets. Using this assumption, we now add a check that the sequence number is not bigger than next_expected + TIPC_MAX_LINK_WIN. We also re-order the checks done, so that the receive window test is performed before the gap test. This way, we are guaranteed that no packets with illegal sequence numbers are ever added to the deferred queue. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S.
Miller --- net/tipc/link.c | 26 ++++++++++++-------------- net/tipc/link.h | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 8e23ab5..2b549f6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1077,13 +1077,14 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, { struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr; - u16 seqno, rcv_nxt; + u16 seqno, rcv_nxt, win_lim; int rc = 0; do { hdr = buf_msg(skb); seqno = msg_seqno(hdr); rcv_nxt = l->rcv_nxt; + win_lim = rcv_nxt + TIPC_MAX_LINK_WIN; /* Verify and update link state */ if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) @@ -1098,6 +1099,12 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Don't send probe at next timeout expiration */ l->silent_intv_cnt = 0; + /* Drop if outside receive window */ + if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { + l->stats.duplicates++; + goto drop; + } + /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { tipc_link_advance_backlog(l, xmitq); @@ -1105,29 +1112,20 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, link_prepare_wakeup(l); } - /* Defer reception if there is a gap in the sequence */ - if (unlikely(less(rcv_nxt, seqno))) { + /* Defer delivery if sequence gap */ + if (unlikely(seqno != rcv_nxt)) { __tipc_skb_queue_sorted(defq, skb); tipc_link_build_nack_msg(l, xmitq); break; } - /* Drop if packet already received */ - if (unlikely(more(rcv_nxt, seqno))) { - l->stats.duplicates++; - goto drop; - } - - /* Packet can be delivered */ + /* Deliver packet */ l->rcv_nxt++; l->stats.recv_info++; - if (!tipc_data_input(l, skb, l->inputq)) rc = tipc_link_input(l, skb, l->inputq); - if (rc) + if (unlikely(rc)) break; - - /* Ack at regular intervals */ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) tipc_link_build_ack_msg(l, xmitq); diff --git a/net/tipc/link.h b/net/tipc/link.h index 
39ff8b6..7a1ad42 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -185,7 +185,7 @@ struct tipc_link { } backlog[5]; u16 snd_nxt; u16 last_retransm; - u32 window; + u16 window; u32 stale_count; /* Reception */ -- cgit v1.1 From 8306f99a517b91ebf8fa94d017c2c84ca62e107c Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:43 -0400 Subject: tipc: disallow packet duplicates in link deferred queue After the previous commits, we are guaranteed that no attempt will be made to add packets of type LINK_PROTOCOL or with illegal sequence numbers to the link deferred queue. This makes it possible to make some simplifications to the sorting algorithm in the function tipc_skb_queue_sorted(). We also alter the function so that it will drop packets if one with the same sequence number is already present in the queue. This is necessary because we have identified weird packet sequences, involving duplicate packets, where a legitimate in-sequence packet may advance to the head of the queue without being detected and de-queued. Finally, we make this function out-of-line, since it will now be called only in exceptional cases. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S.
Miller --- net/tipc/link.c | 2 +- net/tipc/msg.c | 31 +++++++++++++++++++++++++++++++ net/tipc/msg.h | 34 ++-------------------------------- 3 files changed, 34 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 2b549f6..e7c6086 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1114,7 +1114,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { - __tipc_skb_queue_sorted(defq, skb); + __tipc_skb_queue_sorted(defq, seqno, skb); tipc_link_build_nack_msg(l, xmitq); break; } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index c5ac436..454f5ec 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -590,3 +590,34 @@ error: kfree_skb(head); return NULL; } + +/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number + * @list: list to be appended to + * @seqno: sequence number of buffer to add + * @skb: buffer to add + */ +void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, + struct sk_buff *skb) +{ + struct sk_buff *_skb, *tmp; + + if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { + __skb_queue_head(list, skb); + return; + } + + if (more(seqno, buf_seqno(skb_peek_tail(list)))) { + __skb_queue_tail(list, skb); + return; + } + + skb_queue_walk_safe(list, _skb, tmp) { + if (more(seqno, buf_seqno(_skb))) + continue; + if (seqno == buf_seqno(_skb)) + break; + __skb_queue_before(list, _skb, skb); + return; + } + kfree_skb(skb); +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a82c584..c784ba0 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -790,6 +790,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list); +void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, + struct 
sk_buff *skb); static inline u16 buf_seqno(struct sk_buff *skb) { @@ -862,38 +864,6 @@ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, return skb; } -/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number - * @list: list to be appended to - * @skb: buffer to add - * Returns true if queue should treated further, otherwise false - */ -static inline bool __tipc_skb_queue_sorted(struct sk_buff_head *list, - struct sk_buff *skb) -{ - struct sk_buff *_skb, *tmp; - struct tipc_msg *hdr = buf_msg(skb); - u16 seqno = msg_seqno(hdr); - - if (skb_queue_empty(list) || (msg_user(hdr) == LINK_PROTOCOL)) { - __skb_queue_head(list, skb); - return true; - } - if (likely(less(seqno, buf_seqno(skb_peek(list))))) { - __skb_queue_head(list, skb); - return true; - } - if (!more(seqno, buf_seqno(skb_peek_tail(list)))) { - skb_queue_walk_safe(list, _skb, tmp) { - if (likely(less(seqno, buf_seqno(_skb)))) { - __skb_queue_before(list, _skb, skb); - return true; - } - } - } - __skb_queue_tail(list, skb); - return false; -} - /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. -- cgit v1.1 From 73f646cec35477b5099d7e952297cb9e1855be45 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:44 -0400 Subject: tipc: delay ESTABLISH state event when link is established Link establishing, just like link teardown, is a non-atomic action, in the sense that discovering that conditions are right to establish a link, and the actual adding of the link to one of the node's send slots is done in two different lock contexts. The link FSM is designed to help bridge the gap between the two contexts in a safe manner. We have now discovered a weakness in the implementation of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to state LINK_ESTABLISHED already in the first lock context, we are unable to distinguish between a fully established link, i.e., a link that has been added to its slot, and a link that has not yet reached the second lock context. It may hence happen that a manual intervention, e.g., when disabling an interface, causes the function tipc_node_link_down() to try removing the link from the node slots, decrementing its active link counter etc, although the link was never added there in the first place. We solve this by delaying the actual state change until we reach the second lock context, inside the function tipc_node_link_up(). This makes it possible for potential callers of __tipc_node_link_down() to know if they should proceed or not, and the problem is solved. Unfortunately, the situation described above also has a second problem. Since there by necessity is a tipc_node_link_up() call pending once the node lock has been released, we must defuse that call by setting the link back from LINK_ESTABLISHING to LINK_RESET state. This forces us to make a slight modification to the link FSM, which will now look as follows.
+------------------------------------+ |RESET_EVT | | | | +--------------+ | +-----------------| SYNCHING |-----------------+ | |FAILURE_EVT +--------------+ PEER_RESET_EVT| | | A | | | | | | | | | | | | | | |SYNCH_ |SYNCH_ | | | |BEGIN_EVT |END_EVT | | | | | | | V | V V | +-------------+ +--------------+ +------------+ | | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET | | +-------------+ FAILURE_ +--------------+ PEER_ +------------+ | | EVT | A RESET_EVT | | | | | | | | +----------------+ | | | RESET_EVT| |RESET_EVT | | | | | | | | | | |ESTABLISH_EVT | | | | +-------------+ | | | | | | RESET_EVT | | | | | | | | | | | V V V | | | | +-------------+ +--------------+ RESET_EVT| +--->| RESET |--------->| ESTABLISHING |<----------------+ +-------------+ PEER_ +--------------+ | A RESET_EVT | | | | | | | |FAILOVER_ |FAILOVER_ |FAILOVER_ |BEGIN_EVT |END_EVT |BEGIN_EVT | | | V | | +-------------+ | | FAILINGOVER |<----------------+ +-------------+ Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/link.c | 40 ++++++++++++++++++++++++++-------------- net/tipc/link.h | 1 + net/tipc/node.c | 31 ++++++++++++++++++++++--------- 3 files changed, 49 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index e7c6086..8c794c1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -125,6 +125,11 @@ bool tipc_link_is_reset(struct tipc_link *l) return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING); } +bool tipc_link_is_establishing(struct tipc_link *l) +{ + return l->state == LINK_ESTABLISHING; +} + bool tipc_link_is_synching(struct tipc_link *l) { return l->state == LINK_SYNCHING; @@ -321,14 +326,15 @@ int tipc_link_fsm_evt(struct tipc_link *l, int evt) switch (evt) { case LINK_ESTABLISH_EVT: l->state = LINK_ESTABLISHED; - rc |= TIPC_LINK_UP_EVT; break; case LINK_FAILOVER_BEGIN_EVT: l->state = LINK_FAILINGOVER; break; - case LINK_PEER_RESET_EVT: case LINK_RESET_EVT: + l->state = LINK_RESET; + break; case LINK_FAILURE_EVT: + case LINK_PEER_RESET_EVT: case LINK_SYNCH_BEGIN_EVT: case LINK_FAILOVER_END_EVT: break; @@ -1091,9 +1097,9 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, return tipc_link_proto_rcv(l, skb, xmitq); if (unlikely(!link_is_up(l))) { - rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); - if (!link_is_up(l)) - goto drop; + if (l->state == LINK_ESTABLISHING) + rc = TIPC_LINK_UP_EVT; + goto drop; } /* Don't send probe at next timeout expiration */ @@ -1338,6 +1344,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); u16 rcv_nxt = l->rcv_nxt; + int mtyp = msg_type(hdr); char *if_name; int rc = 0; @@ -1347,7 +1354,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (link_own_addr(l) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); - switch (msg_type(hdr)) { + switch (mtyp) { case RESET_MSG: /* Ignore duplicate RESET with old session 
number */ @@ -1374,12 +1381,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; - if (msg_type(hdr) == RESET_MSG) { - rc |= tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); - } else if (!link_is_up(l)) { - tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); - rc |= tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); - } + /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ + if ((mtyp == RESET_MSG) || !link_is_up(l)) + rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); + + /* ACTIVATE_MSG takes up link if it was already locally reset */ + if ((mtyp == ACTIVATE_MSG) && (l->state == LINK_ESTABLISHING)) + rc = TIPC_LINK_UP_EVT; + l->peer_session = msg_session(hdr); l->peer_bearer_id = msg_bearer_id(hdr); if (l->mtu > msg_max_pkt(hdr)) @@ -1396,9 +1405,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, l->stats.recv_states++; if (msg_probe(hdr)) l->stats.recv_probes++; - rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); - if (!link_is_up(l)) + + if (!link_is_up(l)) { + if (l->state == LINK_ESTABLISHING) + rc = TIPC_LINK_UP_EVT; break; + } /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) diff --git a/net/tipc/link.h b/net/tipc/link.h index 7a1ad42..d42dfc0 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -217,6 +217,7 @@ int tipc_link_fsm_evt(struct tipc_link *l, int evt); void tipc_link_reset_fragments(struct tipc_link *l_ptr); bool tipc_link_is_up(struct tipc_link *l); bool tipc_link_is_reset(struct tipc_link *l); +bool tipc_link_is_establishing(struct tipc_link *l); bool tipc_link_is_synching(struct tipc_link *l); bool tipc_link_is_failingover(struct tipc_link *l); bool tipc_link_is_blocked(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index 703875f..656b579 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -317,7 +317,11 @@ static void 
__tipc_node_link_up(struct tipc_node *n, int bearer_id, struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; - if (!nl || !tipc_link_is_up(nl)) + if (!nl) + return; + + tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); + if (!tipc_link_is_up(nl)) return; n->working_links++; @@ -437,17 +441,26 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; + struct tipc_link *l = le->link; struct tipc_media_addr *maddr; struct sk_buff_head xmitq; + if (!l) + return; + __skb_queue_head_init(&xmitq); tipc_node_lock(n); - __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); - if (delete && le->link) { - kfree(le->link); - le->link = NULL; - n->link_cnt--; + if (!tipc_link_is_establishing(l)) { + __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); + if (delete) { + kfree(l); + le->link = NULL; + n->link_cnt--; + } + } else { + /* Defuse pending tipc_node_link_up() */ + tipc_link_fsm_evt(l, LINK_RESET_EVT); } tipc_node_unlock(n); @@ -579,7 +592,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_unlock(n); - if (reset) + if (reset && !tipc_link_is_reset(l)) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } @@ -686,10 +699,10 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: - break; case NODE_SYNCH_END_EVT: - case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: + break; + case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; -- cgit v1.1 From 282b3a056225b35024246f63feb91d769d714dad Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:45 -0400 Subject: tipc: send out RESET immediately when link goes down When a link is taken down because of a node local event, such as disabling of a 
bearer or an interface, we currently leave it to the peer node to discover the broken communication. The default time for such failure discovery is 1.5-2 seconds. If we instead allow the terminating link endpoint to send out a RESET message at the moment it is reset, we can achieve the impression that both endpoints are going down instantly. Since this is a very common scenario, we find it worthwhile to make this small modification. Apart from letting the link produce the said message, we also have to ensure that the interface is able to transmit it before TIPC is detached. We do this by performing the disabling of a bearer in three steps: 1) Disable reception of TIPC packets from the interface in question. 2) Take down the links, while allowing them to send out a RESET message. 3) Disable transmission of TIPC packets on the interface. Apart from this, we now have to react to the NETDEV_GOING_DOWN event, instead of as currently the NETDEV_DOWN event, to ensure that such transmission is possible during the teardown phase. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 8 +++----- net/tipc/link.c | 12 ++++++++++++ net/tipc/link.h | 1 + net/tipc/node.c | 5 +++-- net/tipc/udp_media.c | 1 - 5 files changed, 19 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ce9f7bf..82b2786 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -362,6 +362,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr) b_ptr->media->disable_media(b_ptr); tipc_node_delete_links(net, b_ptr->identity); + RCU_INIT_POINTER(b_ptr->media_ptr, NULL); if (b_ptr->link_req) tipc_disc_delete(b_ptr->link_req); @@ -399,16 +400,13 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, /* tipc_disable_l2_media - detach TIPC bearer from an L2 interface * - * Mark L2 bearer as inactive so that incoming buffers are thrown away, - * then get worker thread to complete bearer cleanup. (Can't do cleanup - * here because cleanup code needs to sleep and caller holds spinlocks.) + * Mark L2 bearer as inactive so that incoming buffers are thrown away */ void tipc_disable_l2_media(struct tipc_bearer *b) { struct net_device *dev; dev = (struct net_device *)rtnl_dereference(b->media_ptr); - RCU_INIT_POINTER(b->media_ptr, NULL); RCU_INIT_POINTER(dev->tipc_ptr, NULL); synchronize_net(); dev_put(dev); @@ -554,7 +552,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; - case NETDEV_DOWN: + case NETDEV_GOING_DOWN: case NETDEV_CHANGEMTU: tipc_reset_bearer(net, b_ptr); break; diff --git a/net/tipc/link.c b/net/tipc/link.c index 8c794c1..737b598 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1062,6 +1062,18 @@ void tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); } +/* tipc_link_build_reset_msg: prepare link RESET or ACTIVATE message + */ +void tipc_link_build_reset_msg(struct tipc_link *l, struct 
sk_buff_head *xmitq) +{ + int mtyp = RESET_MSG; + + if (l->state == LINK_ESTABLISHING) + mtyp = ACTIVATE_MSG; + + tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); +} + /* tipc_link_build_nack_msg: prepare link nack message for transmission */ static void tipc_link_build_nack_msg(struct tipc_link *l, diff --git a/net/tipc/link.h b/net/tipc/link.h index d42dfc0..5872f09 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -213,6 +213,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_bcast_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); +void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); void tipc_link_reset_fragments(struct tipc_link *l_ptr); bool tipc_link_is_up(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index 656b579..fba6e1a 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -41,7 +41,7 @@ #include "socket.h" #include "bcast.h" #include "discover.h" - +#define pr_debug printk /* Node FSM states and events: */ enum { @@ -421,6 +421,8 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, if (!tipc_node_is_up(n)) { tipc_link_reset(l); + tipc_link_build_reset_msg(l, xmitq); + *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); return; } @@ -463,7 +465,6 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_link_fsm_evt(l, LINK_RESET_EVT); } tipc_node_unlock(n); - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index c170d31..9bc0b1e 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -425,7 +425,6 @@ static void tipc_udp_disable(struct tipc_bearer *b) } if (ub->ubsock) sock_set_flag(ub->ubsock->sk, SOCK_DEAD); - RCU_INIT_POINTER(b->media_ptr, NULL); 
RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ -- cgit v1.1 From c819930090fe3f74c822be765c185b3431360193 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 15 Oct 2015 14:52:46 -0400 Subject: tipc: update node FSM when peer RESET message is received The change made in the previous commit revealed a small flaw in the way the node FSM is updated. When the function tipc_node_link_down() is called for the last link to a node, we should check whether this was caused by a local reset or by a received RESET message from the peer. In the latter case, we can directly issue a PEER_LOST_CONTACT_EVT to the node FSM, so that it is ready to re-establish contact. If this is not done, the peer node will sometimes have to go through a second establish cycle before the link becomes stable. We fix this in this commit by conditionally issuing the mentioned event in the function tipc_node_link_down(). We also move LINK_RESET FSM event away from the link_reset() function and into the caller function, partially because it is easier to follow the code when state changes are gathered at a limited number of locations, partially because there will be cases in future commits where we don't want the link to go RESET mode when link_reset() is called. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/link.c | 7 +++++-- net/tipc/link.h | 1 + net/tipc/node.c | 11 +++++++---- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 737b598..ff9b0b9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -120,6 +120,11 @@ bool tipc_link_is_up(struct tipc_link *l) return link_is_up(l); } +bool tipc_link_peer_is_down(struct tipc_link *l) +{ + return l->state == LINK_PEER_RESET; +} + bool tipc_link_is_reset(struct tipc_link *l) { return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING); @@ -584,8 +589,6 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr) void tipc_link_reset(struct tipc_link *l) { - tipc_link_fsm_evt(l, LINK_RESET_EVT); - /* Link is down, accept any session */ l->peer_session = WILDCARD_SESSION; diff --git a/net/tipc/link.h b/net/tipc/link.h index 5872f09..0201212 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -217,6 +217,7 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); void tipc_link_reset_fragments(struct tipc_link *l_ptr); bool tipc_link_is_up(struct tipc_link *l); +bool tipc_link_peer_is_down(struct tipc_link *l); bool tipc_link_is_reset(struct tipc_link *l); bool tipc_link_is_establishing(struct tipc_link *l); bool tipc_link_is_synching(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index fba6e1a..d1f3401 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -41,7 +41,7 @@ #include "socket.h" #include "bcast.h" #include "discover.h" -#define pr_debug printk + /* Node FSM states and events: */ enum { @@ -420,6 +420,10 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, } if (!tipc_node_is_up(n)) { + if (tipc_link_peer_is_down(l)) + tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); + tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, 
xmitq); *maddr = &n->links[*bearer_id].maddr; @@ -434,6 +438,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); tipc_link_reset(l); + tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[tnl->bearer_id].maddr; @@ -581,6 +586,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, goto exit; } tipc_link_reset(l); + tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); le->link = l; @@ -863,9 +869,6 @@ static void node_lost_contact(struct tipc_node *n_ptr, tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } - /* Prevent re-contact with node until cleanup is done */ - tipc_node_fsm_evt(n_ptr, SELF_LOST_CONTACT_EVT); - /* Notify publications from this node */ n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN; -- cgit v1.1 From ef84d8ce5a36d0c4a6454e7e9dff54d19f96a25f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 11:16:26 -0700 Subject: Revert "inet: fix double request socket freeing" This reverts commit c69736696cf3742b37d850289dc0d7ead177bb14. At the time of above commit, tcp_req_err() and dccp_req_err() were dead code, as SYN_RECV request sockets were not yet in ehash table. Real bug was fixed later in a different commit. We need to revert to not leak a refcount on request socket. inet_csk_reqsk_queue_drop_and_put() will be added in following commit to make clean inet_csk_reqsk_queue_drop() does not release the reference owned by caller. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/dccp/ipv4.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0dcf196..644af51 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -208,7 +208,6 @@ void dccp_req_err(struct sock *sk, u64 seq) if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in RESPOND, just remove it silently. @@ -218,6 +217,7 @@ void dccp_req_err(struct sock *sk, u64 seq) */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); } + reqsk_put(req); } EXPORT_SYMBOL(dccp_req_err); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1ff0923..aad2298 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in SYN_RECV, just remove it silently. @@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq) * created socket, and POSIX does not want network * errors returned from accept(). */ - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); } + reqsk_put(req); } EXPORT_SYMBOL(tcp_req_err); -- cgit v1.1 From f03f2e154f52fdaa982de7e2c386737679963dc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 11:16:27 -0700 Subject: tcp/dccp: add inet_csk_reqsk_queue_drop_and_put() helper Let's reduce the confusion about inet_csk_reqsk_queue_drop() : In many cases we also need to release reference on request socket, so add a helper to do this, reducing code size and complexity. Fixes: 4bdc3d66147b ("tcp/dccp: fix behavior of stale SYN_RECV request sockets") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/inet_connection_sock.c | 10 ++++++++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 644af51..59bc180 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -828,7 +828,7 @@ lookup: if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6883193..d9cc731 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -686,7 +686,7 @@ lookup: if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ba9ec9a..b85c720 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -546,6 +546,13 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); +void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) +{ + inet_csk_reqsk_queue_drop(sk, req); + reqsk_put(req); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; @@ -608,8 +615,7 @@ static void reqsk_timer_handler(unsigned long data) return; } drop: - inet_csk_reqsk_queue_drop(sk_listener, req); - reqsk_put(req); + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); } static void reqsk_queue_hash_req(struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aad2298..9c68cf3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1591,7 +1591,7 @@ process: if 
(likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7ce1c57..acb06f8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1386,7 +1386,7 @@ process: if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { -- cgit v1.1 From ebb516af60e18258aac8e80bbe068740ef1579ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 11:16:28 -0700 Subject: tcp/dccp: fix race at listener dismantle phase Under stress, a close() on a listener can trigger the WARN_ON(sk->sk_ack_backlog) in inet_csk_listen_stop() We need to test if listener is still active before queueing a child in inet_csk_reqsk_queue_add() Create a common inet_child_forget() helper, and use it from inet_csk_reqsk_queue_add() and inet_csk_listen_stop() Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 71 ++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b85c720..8430bc8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -764,6 +764,53 @@ int inet_csk_listen_start(struct sock *sk, int backlog) } EXPORT_SYMBOL_GPL(inet_csk_listen_start); +static void inet_child_forget(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != req->rsk_listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + } + inet_csk_destroy_sock(child); + reqsk_put(req); +} + +void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + + spin_lock(&queue->rskq_lock); + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_child_forget(sk, req, child); + } else { + req->sk = child; + req->dl_next = NULL; + if (queue->rskq_accept_head == NULL) + queue->rskq_accept_head = req; + else + queue->rskq_accept_tail->dl_next = req; + queue->rskq_accept_tail = req; + sk_acceptq_added(sk); + } + spin_unlock(&queue->rskq_lock); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_add); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. 
@@ -790,31 +837,11 @@ void inet_csk_listen_stop(struct sock *sk) WARN_ON(sock_owned_by_user(child)); sock_hold(child); - sk->sk_prot->disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - percpu_counter_inc(sk->sk_prot->orphan_count); - - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != req->rsk_listener); - - /* Paranoid, to prevent race condition if - * an inbound pkt destined for child is - * blocked by sock lock in tcp_v4_rcv(). - * Also to satisfy an assertion in - * tcp_v4_destroy_sock(). - */ - tcp_sk(child)->fastopen_rsk = NULL; - } - inet_csk_destroy_sock(child); - + inet_child_forget(sk, req, child); bh_unlock_sock(child); local_bh_enable(); sock_put(child); - reqsk_put(req); cond_resched(); } if (queue->fastopenq.rskq_rst_head) { @@ -829,7 +856,7 @@ void inet_csk_listen_stop(struct sock *sk) req = next; } } - WARN_ON(sk->sk_ack_backlog); + WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); -- cgit v1.1 From 51161aa98d0aa4eb20952e16d6c6dbb1d085330e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 14 Oct 2015 16:44:00 -0700 Subject: net: Fix suspicious RCU usage in fib_rebalance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This command: ip route add 192.168.1.0/24 nexthop via 10.2.1.5 dev eth1 nexthop via 10.2.2.5 dev eth2 generated this suspicious RCU usage message: [ 63.249262] [ 63.249939] =============================== [ 63.251571] [ INFO: suspicious RCU usage. ] [ 63.253250] 4.3.0-rc3+ #298 Not tainted [ 63.254724] ------------------------------- [ 63.256401] ../include/linux/inetdevice.h:205 suspicious rcu_dereference_check() usage! 
[ 63.259450] [ 63.259450] other info that might help us debug this: [ 63.259450] [ 63.262297] [ 63.262297] rcu_scheduler_active = 1, debug_locks = 1 [ 63.264647] 1 lock held by ip/2870: [ 63.265896] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x12/0x14 [ 63.268858] [ 63.268858] stack backtrace: [ 63.270409] CPU: 4 PID: 2870 Comm: ip Not tainted 4.3.0-rc3+ #298 [ 63.272478] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 63.275745] 0000000000000001 ffff8800b8c9f8b8 ffffffff8125f73c ffff88013afcf301 [ 63.278185] ffff8800bab7a380 ffff8800b8c9f8e8 ffffffff8107bf30 ffff8800bb728000 [ 63.280634] ffff880139fe9a60 0000000000000000 ffff880139fe9a00 ffff8800b8c9f908 [ 63.283177] Call Trace: [ 63.283959] [] dump_stack+0x4c/0x68 [ 63.285593] [] lockdep_rcu_suspicious+0xfa/0x103 [ 63.287500] [] __in_dev_get_rcu+0x48/0x4f [ 63.289169] [] fib_rebalance+0x3e/0x127 [ 63.290753] [] ? rcu_read_unlock+0x3e/0x5f [ 63.292442] [] fib_create_info+0xaf9/0xdcc [ 63.294093] [] ? sched_clock_local+0x12/0x75 [ 63.295791] [] fib_table_insert+0x8c/0x451 [ 63.297493] [] ? fib_get_table+0x36/0x43 [ 63.299109] [] inet_rtm_newroute+0x43/0x51 [ 63.300709] [] rtnetlink_rcv_msg+0x182/0x195 [ 63.302334] [] ? trace_hardirqs_on+0xd/0xf [ 63.303888] [] ? rtnl_lock+0x12/0x14 [ 63.305346] [] ? __rtnl_unlock+0x12/0x12 [ 63.306878] [] netlink_rcv_skb+0x3d/0x90 [ 63.308437] [] rtnetlink_rcv+0x21/0x28 [ 63.309916] [] netlink_unicast+0xfa/0x17f [ 63.311447] [] netlink_sendmsg+0x297/0x2dc [ 63.313029] [] sock_sendmsg_nosec+0x12/0x1d [ 63.314597] [] ___sys_sendmsg+0x196/0x21b [ 63.316125] [] ? native_sched_clock+0x1f/0x3c [ 63.317671] [] ? sched_clock_local+0x12/0x75 [ 63.319185] [] ? sched_clock_cpu+0x9d/0xb6 [ 63.320693] [] ? __lock_is_held+0x32/0x54 [ 63.322145] [] ? 
__fget_light+0x4b/0x77 [ 63.323541] [] __sys_sendmsg+0x3d/0x5b [ 63.324947] [] SyS_sendmsg+0xd/0x19 [ 63.326274] [] entry_SYSCALL_64_fastpath+0x12/0x6f It looks like all of the code paths to fib_rebalance are under rtnl. Fixes: 0e884c78ee19 ("ipv4: L3 hash-based multipath") Cc: Peter Nørlund Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index af77298..42778d9 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -545,7 +545,7 @@ static void fib_rebalance(struct fib_info *fi) if (nh->nh_flags & RTNH_F_DEAD) continue; - in_dev = __in_dev_get_rcu(nh->nh_dev); + in_dev = __in_dev_get_rtnl(nh->nh_dev); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && @@ -559,7 +559,7 @@ static void fib_rebalance(struct fib_info *fi) change_nexthops(fi) { int upper_bound; - in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); if (nexthop_nh->nh_flags & RTNH_F_DEAD) { upper_bound = -1; -- cgit v1.1 From 573c7ba006edbecff0714db651dd3602b9d0a6a0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 16 Oct 2015 14:01:22 +0200 Subject: net: introduce pre-change upper device notifier This newly introduced netdevice notifier is called before actual change upper happens. That provides a possibility for notifier handlers to know upper change will happen and react to it, including possibility to forbid the change. That is valuable for drivers which can check if the upper device linkage is supported and forbid that in case it is not. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a229bf0..1225b4b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5346,6 +5346,12 @@ static int __netdev_upper_dev_link(struct net_device *dev, changeupper_info.master = master; changeupper_info.linking = true; + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, master); if (ret) @@ -5488,6 +5494,9 @@ void netdev_upper_dev_unlink(struct net_device *dev, changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; changeupper_info.linking = false; + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + &changeupper_info.info); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower -- cgit v1.1