From 7016e0627171878810798a842a416dddee4e3329 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 13 Sep 2017 13:58:15 -0700 Subject: net: Convert int functions to bool Global function ipv6_rcv_saddr_equal and static functions ipv6_rcv_saddr_equal and ipv4_rcv_saddr_equal currently return int. bool is slightly more descriptive for these functions so change their return type from int to bool. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f44ff24..87981cd 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -94,8 +94,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard); +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); -- cgit v1.1 From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 05:14:24 -0700 Subject: net: sk_buff rbnode reorg skb->rbnode shares space with skb->next, skb->prev and skb->tstamp Current uses (TCP receive ofo queue and netem) need to save/restore tstamp, while skb->dev is either NULL (TCP) or a constant for a given queue (netem). Since we plan using an RB tree for TCP retransmit queue to speedup SACK processing with large BDP, this patch exchanges skb->dev and skb->tstamp. This saves some overhead in both TCP and netem. v2: removes the swtstamp field from struct tcp_skb_cb Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Wei Wang Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ++++++++-------- include/net/tcp.h | 6 ------ 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72299ef..4928288 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -661,8 +661,12 @@ struct sk_buff { struct sk_buff *prev; union { - ktime_t tstamp; - u64 skb_mstamp; + struct net_device *dev; + /* Some protocols might use this space to store information, + * while device pointer would be NULL. + * UDP receive path is one user. + */ + unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ @@ -670,12 +674,8 @@ struct sk_buff { struct sock *sk; union { - struct net_device *dev; - /* Some protocols might use this space to store information, - * while device pointer would be NULL. - * UDP receive path is one user. - */ - unsigned long dev_scratch; + ktime_t tstamp; + u64 skb_mstamp; }; /* * This is the control buffer. It is free to use for every diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f28..49a8a46 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -797,12 +797,6 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; - - /* Used to stash the receive timestamp while this skb is in the - * out of order queue, as skb->tstamp is overwritten by the - * rbnode. - */ - ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ -- cgit v1.1 From f5619866592c65adc087364cc1a3ba709201ea26 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:57 -0400 Subject: net: dsa: remove copy of master ethtool_ops There is no need to store a copy of the master ethtool ops, storing the original pointer in DSA and the new one in the master netdev itself is enough. In the meantime, set orig_ethtool_ops to NULL when restoring the master ethtool ops and check the presence of the master original ethtool ops as well as its needed functions before calling them. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index dd44d6c..8dee216 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -188,7 +188,6 @@ struct dsa_port { /* * Original copy of the master netdev ethtool_ops */ - struct ethtool_ops ethtool_ops; const struct ethtool_ops *orig_ethtool_ops; }; -- cgit v1.1 From 752fbcc33405d6f8249465e4b2c4e420091bb825 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 19 Sep 2017 13:15:42 -0700 Subject: net_sched: no need to free qdisc in RCU callback gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), the caller no longer needs to wait for a grace period. So this patch gets rid of it. Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 135f5a2..684d8ed 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -93,7 +93,6 @@ struct Qdisc { unsigned long state; struct Qdisc *next_sched; struct sk_buff *skb_bad_txq; - struct rcu_head rcu_head; int padded; refcount_t refcnt; -- cgit v1.1 From a90c9347e90ed1e9323d71402ed18023bc910cd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:06 -0700 Subject: ipv6: addrlabel: per netns list Having a global list of labels do not scale to thousands of netns in the cloud era. This causes quadratic behavior on netns creation and deletion. This is time having a per netns list of ~10 labels. Tested: $ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ] real 0m20.837s # instead of 0m24.227s user 0m0.328s sys 0m20.338s # instead of 0m23.753s 16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 12.30% ip [kernel.kallsyms] [k] netlink_has_listeners 6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.78% ip [kernel.kallsyms] [k] memset_erms 5.77% ip [kernel.kallsyms] [k] kobject_uevent_env 5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.96% ip [kernel.kallsyms] [k] _raw_read_lock 3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 2.11% ip [kernel.kallsyms] [k] unmap_page_range 1.77% ip [kernel.kallsyms] [k] __wake_up 1.69% ip [kernel.kallsyms] [k] strlen 1.17% ip [kernel.kallsyms] [k] __wake_up_common 1.09% ip [kernel.kallsyms] [k] insert_header 1.04% ip [kernel.kallsyms] [k] page_remove_rmap 1.01% ip [kernel.kallsyms] [k] consume_skb 0.98% ip [kernel.kallsyms] [k] netlink_trim 0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling 0.51% ip [kernel.kallsyms] [k] filemap_map_pages 0.46% ip [kernel.kallsyms] [k] memcpy_erms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2544f97..2ea1ed3 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -89,6 +89,11 @@ struct netns_ipv6 { atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; + struct { + struct hlist_head head; + spinlock_t lock; + u32 seq; + } ip6addrlbl_table; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) -- cgit v1.1 From 64bc17811b72758753e2b64cd8f2a63812c61fe1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:09 -0700 Subject: ipv4: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s After patch: $ time ./add_del_unshare.sh net_namespace 135 291 5504 1 2 : tunables 8 4 0 : slabdata 135 291 0 real 0m22.117s user 0m0.728s sys 0m35.328s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9926528..b41a1e0 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -258,7 +258,8 @@ int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); +void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, + struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); -- cgit v1.1 From 2512b1b18d0748d867bb22387db7c86b903291ad Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:31 +0300 Subject: mac80211: extend ieee80211_ie_split to support EXTENSION Current ieee80211_ie_split() implementation doesn't account for elements that are sub-elements of the EXTENSION IE. To extend support to these IEs as well, treat the WLAN_EID_EXTENSION ids in the %ids array as indicating that the next id in the array is a sub-element of the EXTENSION IE. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f12fa52..aa9d993 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5934,7 +5934,8 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, * @ies: the IE buffer * @ielen: the length of the IE buffer * @ids: an array with element IDs that are allowed before - * the split + * the split. A WLAN_EID_EXTENSION value means that the next + * EID in the list is a sub-element of the EXTENSION IE. * @n_ids: the size of the element ID array * @after_ric: array IE types that come after the RIC element * @n_after_ric: size of the @after_ric array @@ -5965,7 +5966,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * @ies: the IE buffer * @ielen: the length of the IE buffer * @ids: an array with element IDs that are allowed before - * the split + * the split. A WLAN_EID_EXTENSION value means that the next + * EID in the list is a sub-element of the EXTENSION IE. * @n_ids: the size of the element ID array * @offset: offset where to start splitting in the buffer * -- cgit v1.1 From 2d23d0736e3a4a0fdb92b8e46ea476639f16aae8 Mon Sep 17 00:00:00 2001 From: Roee Zamir Date: Sun, 6 Aug 2017 11:38:22 +0300 Subject: nl80211: add OCE scan and capability flags Add Optimized Connectivity Experience (OCE) scan and capability flags. Some of them unique to OCE and some are stand alone. And add scan flags to enable/disable them. Signed-off-by: Roee Zamir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 51626b4..76404d8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4914,6 +4914,15 @@ enum nl80211_feature_flags { * handshake with 802.1X in station mode (will pass EAP frames to the host * and accept the set_pmk/del_pmk commands), doing it in the host might not * be supported. + * @NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME: Driver is capable of overriding + * the max channel attribute in the FILS request params IE with the + * actual dwell time. + * @NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP: Driver accepts broadcast probe + * response + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE: Driver supports sending + * the first probe request in each channel at rate of at least 5.5Mbps. + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports + * probe request tx deferral and suppression * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4936,6 +4945,10 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5012,12 +5025,28 @@ enum nl80211_timeout_reason { * locally administered 1, multicast 0) is assumed. * This flag must not be requested when the feature isn't supported, check * the nl80211 feature flags for the device. + * @NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME: fill the dwell time in the FILS + * request parameters IE in the probe request + * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at + * rate of at least 5.5M. In case non OCE AP is dicovered in the channel, + * only the first probe req in the channel will be sent in high rate. + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request + * tx deferral (dot11FILSProbeDelay shall be set to 15ms) + * and suppression (if it has received a broadcast Probe Response frame, + * Beacon frame or FILS Discovery frame from an AP that the STA considers + * a suitable candidate for (re-)association - suitable in terms of + * SSID and/or RSSI */ enum nl80211_scan_flags { - NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, - NL80211_SCAN_FLAG_FLUSH = 1<<1, - NL80211_SCAN_FLAG_AP = 1<<2, - NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, + NL80211_SCAN_FLAG_FLUSH = 1<<1, + NL80211_SCAN_FLAG_AP = 1<<2, + NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME = 1<<4, + NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP = 1<<5, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE = 1<<6, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION = 1<<7, }; /** -- cgit v1.1 From 1272c5d89b597995cb10db87dd4a1adc91d36006 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 18 Aug 2017 15:33:56 +0300 Subject: mac80211: add documentation to ieee80211_rx_ba_offl() Add documentation to ieee80211_rx_ba_offl() function and, while at it, rename the bit argument to tid, for consistency. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 885690f..cc9073e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5441,8 +5441,14 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, */ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn); +/** + * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work + * @vif: &struct ieee80211_vif pointer from the add_interface callback + * @addr: station mac address + * @tid: the rx tid + */ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr, - unsigned int bit); + unsigned int tid); /** * ieee80211_start_rx_ba_session_offl - start a Rx BA session -- cgit v1.1 From a6bcda44843c6dfced0fb973e2607c2a98addfa9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2017 11:52:43 +0200 Subject: cfg80211: remove unused function ieee80211_data_from_8023() This function hasn't been used since the removal of iwmc3200wifi in 2012. It also appears to have a bug when qos=True, since then it'll copy uninitialized stack memory to the SKB. Just remove the function entirely. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index aa9d993..cc19960 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4347,19 +4347,6 @@ static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, } /** - * ieee80211_data_from_8023 - convert an 802.3 frame to 802.11 - * @skb: the 802.3 frame - * @addr: the device MAC address - * @iftype: the virtual interface type - * @bssid: the network bssid (used only for iftype STATION and ADHOC) - * @qos: build 802.11 QoS data frame - * Return: 0 on success, or a negative error code. - */ -int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype, const u8 *bssid, - bool qos); - -/** * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame * * Decode an IEEE 802.11 A-MSDU and convert it to a list of 802.3 frames. -- cgit v1.1 From 65026002d69de006e273749bb799d3b01b757eb0 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 18 Aug 2017 15:31:41 +0300 Subject: nl80211: add an option to allow MFP without requiring it The user space can now allow the kernel to associate to an AP that requires MFP or that doesn't have MFP enabled in the same NL80211_CMD_CONNECT command, by using a new NL80211_MFP_OPTIONAL flag. The driver / firmware will decide whether to use it or not. Include a feature bit to advertise support for NL80211_MFP_OPTIONAL. This allows new user space to run on old kernels and know that it cannot use the new attribute if it isn't supported. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 76404d8..59ba6ca 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1407,8 +1407,12 @@ enum nl80211_commands { * * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is * used for the association (&enum nl80211_mfp, represented as a u32); - * this attribute can be used - * with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests + * this attribute can be used with %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_CONNECT requests. %NL80211_MFP_OPTIONAL is not allowed for + * %NL80211_CMD_ASSOCIATE since user space SME is expected and hence, it + * must have decided whether to use management frame protection or not. + * Setting %NL80211_MFP_OPTIONAL with a %NL80211_CMD_CONNECT request will + * let the driver (or the firmware) decide whether to use MFP or not. * * @NL80211_ATTR_STA_FLAGS2: Attribute containing a * &struct nl80211_sta_flag_update. @@ -3947,10 +3951,12 @@ enum nl80211_key_type { * enum nl80211_mfp - Management frame protection state * @NL80211_MFP_NO: Management frame protection not used * @NL80211_MFP_REQUIRED: Management frame protection required + * @NL80211_MFP_OPTIONAL: Management frame protection is optional */ enum nl80211_mfp { NL80211_MFP_NO, NL80211_MFP_REQUIRED, + NL80211_MFP_OPTIONAL, }; enum nl80211_wpa_versions { @@ -4923,6 +4929,8 @@ enum nl80211_feature_flags { * the first probe request in each channel at rate of at least 5.5Mbps. * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports * probe request tx deferral and suppression + * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL + * value in %NL80211_ATTR_USE_MFP. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4949,6 +4957,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_MFP_OPTIONAL, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.1 From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Sep 2017 18:26:53 +0200 Subject: net: avoid a full fib lookup when rp_filter is disabled. Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") a full fib lookup is needed even if the rp_filter is disabled, if accept_local is false - which is the default. What we really need in the above scenario is just checking that the source IP address is not local, and in most case we can do that is a cheaper way looking up the ifaddr hash table. This commit adds a helper for such lookup, and uses it to validate the src address when rp_filter is disabled and no 'local' routes are created by the user space in the relevant namespace. A new ipv4 netns flag is added to account for such routes. We need that to preserve the same behavior we had before this patch. It also drops the checks to bail early from __fib_validate_source, added by the commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") they do not give any measurable performance improvement: if we do the lookup with are on a slower path. This improves UDP performances for unconnected sockets when rp_filter is disabled by 5% and also gives small but measurable performance improvement for TCP flood scenarios. v1 -> v2: - use the ifaddr lookup helper in __ip_dev_find(), as suggested by Eric - fall-back to full lookup if custom local routes are present Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + include/net/netns/ipv4.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index fb3f809..751d051 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -179,6 +179,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20d061c..2072072 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; + bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif -- cgit v1.1 From a1f3316dd7b5ce740c774697c664e2e60d095794 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Sep 2017 18:18:23 -0700 Subject: ipv4: Move fib_has_custom_local_routes outside of IP_MULTIPLE_TABLES. > net/ipv4/fib_frontend.c: In function 'fib_validate_source': > net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > if (net->ipv4.fib_has_custom_local_routes) > ^ > net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute': > net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > net->ipv4.fib_has_custom_local_routes = true; > ^ Fixes: 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2072072..8387f09 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,10 +49,10 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; - bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif + bool fib_has_custom_local_routes; #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; #endif -- cgit v1.1 From 242c1a28eb61cb34974e8aa05235d84355940a8a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 22 Sep 2017 10:25:22 +0800 Subject: net: Remove useless function skb_header_release There is no one which would invokes the function skb_header_release. So just remove it now. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4928288..f9db553 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1457,27 +1457,8 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) } /** - * skb_header_release - release reference to header - * @skb: buffer to operate on - * - * Drop a reference to the header part of the buffer. This is done - * by acquiring a payload reference. You must not read from the header - * part of skb->data after this. - * Note : Check if you can use __skb_header_release() instead. - */ -static inline void skb_header_release(struct sk_buff *skb) -{ - BUG_ON(skb->nohdr); - skb->nohdr = 1; - atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); -} - -/** * __skb_header_release - release reference to header * @skb: buffer to operate on - * - * Variant of skb_header_release() assuming skb is private to caller. - * We can avoid one atomic operation. */ static inline void __skb_header_release(struct sk_buff *skb) { -- cgit v1.1 From 373b8eeb0c15d4ce58f62afb12f213b1b5bbc3d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:45:43 +0300 Subject: xfrm: make aead_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f002a2c..0be4c54 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1764,7 +1764,7 @@ static inline int xfrm_acquire_is_on(struct net *net) } #endif -static inline int aead_len(struct xfrm_algo_aead *alg) +static inline unsigned int aead_len(struct xfrm_algo_aead *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -- cgit v1.1 From 06cd22f830f28023b82455c82c7db65fc6cf9c16 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:46:30 +0300 Subject: xfrm: make xfrm_alg_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0be4c54..2abc0e1 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1769,7 +1769,7 @@ static inline unsigned int aead_len(struct xfrm_algo_aead *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_alg_len(const struct xfrm_algo *alg) +static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -- cgit v1.1 From 1bd963a72e859d194d87a5a2a8839efee7e23102 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:09 +0300 Subject: xfrm: make xfrm_alg_auth_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2abc0e1..5d5e11b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1774,7 +1774,7 @@ static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) +static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -- cgit v1.1 From 5e708e47c44366453c33373940455a75fd33f635 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:50 +0300 Subject: xfrm: make xfrm_replay_state_esn_len() return unsigned int Replay detection bitmaps can't have negative length. Comparisons with nla_len() are left signed just in case negative value can sneak in there. Propagate unsignedness for code size savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-38 (-38) function old new delta xfrm_state_construct 1802 1800 -2 xfrm_update_ae_params 295 289 -6 xfrm_state_migrate 1345 1339 -6 xfrm_replay_notify_esn 349 337 -12 xfrm_replay_notify_bmp 345 333 -12 Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5d5e11b..3cb618b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1779,7 +1779,7 @@ static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) +static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) { return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32); } -- cgit v1.1 From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:14 -0700 Subject: tun: enable NAPI for TUN/TAP driver Changes TUN driver to use napi_gro_receive() upon receiving packets rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these changes and operation is not affected if the flag is disabled. SKBs are constructed upon packet arrival and are queued to be processed later. The new path was evaluated with a benchmark with the following setup: Open two tap devices and a receiver thread that reads in a loop for each device. Start one sender thread and pin all threads to different CPUs. Send 1M minimum UDP packets to each device and measure sending time for each of the sending methods: napi_gro_receive(): 4.90s netif_rx_ni(): 4.90s netif_receive_skb(): 7.20s Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 3cb5e1d..30b6184 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 +#define IFF_NAPI 0x0010 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit v1.1 From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:15 -0700 Subject: tun: enable napi_gro_frags() for TUN/TAP driver Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. This flag is accepted only if the device is in TAP mode and has the IFF_NAPI flag set as well. This is done because both of these are explicit requirements for correct operation in this mode. Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 30b6184..365ade5 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -61,6 +61,7 @@ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 #define IFF_NAPI 0x0010 +#define IFF_NAPI_FRAGS 0x0020 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit v1.1 From e451ae8e4f6b3f6bd3b83a5595657b5421b3bf69 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:01:06 +0300 Subject: neigh: make struct neigh_table::entry_size unsigned int Neigh entry size can't be negative. Space savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-7 (-7) function old new delta lowpan_neigh_construct 25 24 -1 clip_seq_sub_iter 152 151 -1 clip_ioctl 1475 1474 -1 clip_constructor 93 92 -1 __neigh_create 2455 2452 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9816df2..9a25512 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -190,7 +190,7 @@ struct neigh_hash_table { struct neigh_table { int family; - int entry_size; + unsigned int entry_size; int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, -- cgit v1.1 From 01ccdf126ca5f9d4fe0889f65ee67afac910f19c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:03:04 +0300 Subject: neigh: make strucrt neigh_table::entry_size unsigned int Key length can't be negative. Leave comparisons against nla_len() signed just in case truncated attribute can sneak in there. Space savings: add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-7 (-7) function old new delta pneigh_delete 273 272 -1 mlx5e_rep_netevent_event 1415 1414 -1 mlx5e_create_encap_header_ipv6 1194 1193 -1 mlx5e_create_encap_header_ipv4 1071 1070 -1 cxgb4_l2t_get 1104 1103 -1 __pneigh_lookup 69 68 -1 __neigh_create 2452 2451 -1 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a25512..2492000 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -191,7 +191,7 @@ struct neigh_hash_table { struct neigh_table { int family; unsigned int entry_size; - int key_len; + unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, -- cgit v1.1 From 1e99c497012cd8647972876f1bd18545bc907aea Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:45 +0300 Subject: qed: iWARP - Add check for errors on a SYN packet A SYN packet which arrives with errors from FW should be dropped. This required adding an additional field to the ll2 rx completion data. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_ll2_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index dd7a3b8..89fa0bb 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data { void *cookie; dma_addr_t rx_buf_addr; u16 parse_flags; + u16 err_flags; u16 vlan; bool b_last_packet; u8 connection_handle; -- cgit v1.1 From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/filter.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d29e58f..052bab3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -496,10 +496,13 @@ struct xdp_buff { void *data_hard_start; }; -/* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf, act_bpf and lwt programs +/* Compute the linear packet data range [data, data_end) which + * will be accessed by various program types (cls_bpf, act_bpf, + * lwt, ...). Subsystems allowing direct data access must (!) + * ensure that cb[] area can be written to when BPF program is + * invoked (otherwise cb[] save/restore is necessary). */ -static inline void bpf_compute_data_end(struct sk_buff *skb) +static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; -- cgit v1.1 From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++++++++++++-- include/linux/skbuff.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/bpf.h | 13 ++++++++- 4 files changed, 98 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8390859..2b672c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -137,6 +137,7 @@ enum bpf_reg_type { PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ + PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 052bab3..911d454 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -487,12 +487,14 @@ struct sk_filter { struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; + void *data_meta; void *data_end; }; struct xdp_buff { void *data; void *data_end; + void *data_meta; void *data_hard_start; }; @@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb)); - cb->data_end = skb->data + skb_headlen(skb); + cb->data_meta = skb->data - skb_metadata_len(skb); + cb->data_end = skb->data + skb_headlen(skb); } static inline u8 *bpf_skb_cb(struct sk_buff *skb) @@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + void bpf_warn_invalid_xdp_action(u32 act); -void bpf_warn_invalid_xdp_redirect(u32 ifindex); struct sock *do_sk_redirect_map(void); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f9db553..19e64bf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - unsigned short _unused; - unsigned char nr_frags; + __u8 __unused; + __u8 meta_len; + __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ @@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void) return 0; } +static inline u8 skb_metadata_len(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->meta_len; +} + +static inline void *skb_metadata_end(const struct sk_buff *skb) +{ + return skb_mac_header(skb); +} + +static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b, + u8 meta_len) +{ + const void *a = skb_metadata_end(skb_a); + const void *b = skb_metadata_end(skb_b); + /* Using more efficient varaiant than plain call to memcmp(). */ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + u64 diffs = 0; + + switch (meta_len) { +#define __it(x, op) (x -= sizeof(u##op)) +#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) + case 32: diffs |= __it_diff(a, b, 64); + case 24: diffs |= __it_diff(a, b, 64); + case 16: diffs |= __it_diff(a, b, 64); + case 8: diffs |= __it_diff(a, b, 64); + break; + case 28: diffs |= __it_diff(a, b, 64); + case 20: diffs |= __it_diff(a, b, 64); + case 12: diffs |= __it_diff(a, b, 64); + case 4: diffs |= __it_diff(a, b, 32); + break; + } + return diffs; +#else + return memcmp(a - meta_len, b - meta_len, meta_len); +#endif +} + +static inline bool skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + u8 len_a = skb_metadata_len(skb_a); + u8 len_b = skb_metadata_len(skb_b); + + if (!(len_a | len_b)) + return false; + + return len_a != len_b ? + true : __skb_metadata_differs(skb_a, skb_b, len_a); +} + +static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) +{ + skb_shinfo(skb)->meta_len = meta_len; +} + +static inline void skb_metadata_clear(struct sk_buff *skb) +{ + skb_metadata_set(skb, 0); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c4..e43491a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -582,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -638,6 +644,7 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -715,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -723,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -783,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action { -- cgit v1.1 From 3b8e9238a8d194e82f0202a5fb68a63686ebe420 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Sep 2017 10:58:21 +0200 Subject: net: sched: introduce helper to identify gact pass action Introduce a helper called is_tcf_gact_pass which could be used to tell if the action is gact pass or not. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 41afe1c..d979a0d 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -33,6 +33,11 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, return false; } +static inline bool is_tcf_gact_ok(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_OK, false); +} + static inline bool is_tcf_gact_shot(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_SHOT, false); -- cgit v1.1 From 85e482285bbbd508483cbe08de69c8fe00cdbbfe Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:11 +0200 Subject: fib: notifier: Add VIF add and delete event types In order for an interface to forward packets according to the kernel multicast routing table, it must be configured with a VIF index according to the mroute user API. The VIF index is then used to refer to that interface in the mroute user API, for example, to set the iif and oifs of an MFC entry. In order to allow drivers to be aware and offload multicast routes, they have to be aware of the VIF add and delete notifications. Due to the fact that a specific VIF can be deleted and re-added pointing to another netdevice, and the MFC routes that point to it will forward the matching packets to the new netdevice, a driver willing to offload MFC cache entries must be aware of the VIF add and delete events in addition to MFC routes notifications. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 669b971..54cd6b8 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -20,6 +20,8 @@ enum fib_event_type { FIB_EVENT_RULE_DEL, FIB_EVENT_NH_ADD, FIB_EVENT_NH_DEL, + FIB_EVENT_VIF_ADD, + FIB_EVENT_VIF_DEL, }; struct fib_notifier_ops { -- cgit v1.1 From 310ebbba3b7396b00bce08a33f1d2de2c74fa257 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:12 +0200 Subject: ipmr: Add reference count to MFC entries Next commits will introduce MFC notifications through the atomic fib_notification chain, thus allowing modules to be aware of MFC entries. Due to the fact that modules may need to hold a reference to an MFC entry, add reference count to MFC entries to prevent them from being freed while these modules use them. The reference counting is done only on resolved MFC entries currently. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index d7f6333..10028f2 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -109,6 +109,7 @@ struct mfc_cache_cmp_arg { * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array + * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction */ @@ -138,6 +139,7 @@ struct mfc_cache { unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; + refcount_t refcount; } res; } mfc_un; struct list_head list; @@ -148,4 +150,23 @@ struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); + +#ifdef CONFIG_IP_MROUTE +void ipmr_cache_free(struct mfc_cache *mfc_cache); +#else +static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) +{ +} +#endif + +static inline void ipmr_cache_put(struct mfc_cache *c) +{ + if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + ipmr_cache_free(c); +} +static inline void ipmr_cache_hold(struct mfc_cache *c) +{ + refcount_inc(&c->mfc_un.res.refcount); +} + #endif -- cgit v1.1 From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:13 +0200 Subject: ipmr: Add FIB notification access functions Make the ipmr module register as a FIB notifier. To do that, implement both the ipmr_seq_read and ipmr_dump ops. The ipmr_seq_read op returns a sequence counter that is incremented on every notification related operation done by the ipmr. To implement that, add a sequence counter in the netns_ipv4 struct and increment it whenever a new MFC route or VIF are added or deleted. The sequence operations are protected by the RTNL lock. The ipmr_dump iterates the list of MFC routes and the list of VIF entries and sends notifications about them. The entries dump is done under RCU where the VIF dump uses the mrt_lock too, as the vif->dev field can change under RCU. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 15 +++++++++++++++ include/net/netns/ipv4.h | 3 +++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 10028f2..54c5cb8 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #ifdef CONFIG_IP_MROUTE @@ -58,6 +59,14 @@ struct vif_device { int link; /* Physical interface index */ }; +struct vif_entry_notifier_info { + struct fib_notifier_info info; + struct net_device *dev; + vifi_t vif_index; + unsigned short vif_flags; + u32 tb_id; +}; + #define VIFF_STATIC 0x8000 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) @@ -146,6 +155,12 @@ struct mfc_cache { struct rcu_head rcu; }; +struct mfc_entry_notifier_info { + struct fib_notifier_info info; + struct mfc_cache *mfc; + u32 tb_id; +}; + struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8387f09..abc84d9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -163,6 +163,9 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* protected by rtnl_mutex */ + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif -- cgit v1.1 From c7c0bbeae9501a7e42f2fd306d6a6399aca688b6 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:15 +0200 Subject: net: ipmr: Add MFC offload indication Allow drivers, registered to the fib notification chain indicate whether a multicast MFC route is offloaded or not, similarly to unicast routes. The indication of whether a route is offloaded is done using the mfc_flags field on an mfc_cache struct, and the information is sent to the userspace via the RTNetlink interface only. Currently, MFC routes are either offloaded or not, thus there is no need to add per-VIF offload indication. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 54c5cb8..5566580 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -90,9 +90,11 @@ struct mr_table { /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) + * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), + MFC_OFFLOAD = BIT(1), }; struct mfc_cache_cmp_arg { -- cgit v1.1 From 478e4c2f0067d57d7c17059caafab026ca32084a Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:16 +0200 Subject: net: mroute: Check if rule is a default rule When the ipmr starts, it adds one default FIB rule that matches all packets and sends them to the DEFAULT (multicast) FIB table. A more complex rule can be added by user to specify that for a specific interface, a packet should be look up at either an arbitrary table or according to the l3mdev of the interface. For drivers willing to offload the ipmr logic into a hardware but don't want to offload all the FIB rules functionality, provide a function that can indicate whether the FIB rule is the default multicast rule, thus only one routing table is needed. This way, a driver can register to the FIB notification chain, get notifications about FIB rules added and trigger some kind of an internal abort mechanism when a non default rule is added by the user. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 5566580..b072a84 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); +bool ipmr_rule_default(const struct fib_rule *rule); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, char __user *optval, unsigned int optlen) @@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt) { return 0; } + +static inline bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} #endif struct vif_device { -- cgit v1.1 From 6ade97da601f8af793f6c7a861af754d0f0b6767 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 26 Sep 2017 23:12:28 +0300 Subject: arp: make arp_hdr_len() return unsigned int Negative ARP header length are not a thing. Constify arguments while I'm at it. Space savings: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) function old new delta arpt_do_table 1163 1160 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/if_arp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 3355efc..6756fea 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb) return (struct arphdr *)skb_network_header(skb); } -static inline int arp_hdr_len(struct net_device *dev) +static inline unsigned int arp_hdr_len(const struct net_device *dev) { switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) -- cgit v1.1 From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 27 Sep 2017 16:12:44 +0300 Subject: net: bridge: add per-port group_fwd_mask with less restrictions We need to be able to transparently forward most link-local frames via tunnels (e.g. vxlan, qinq). Currently the bridge's group_fwd_mask has a mask which restricts the forwarding of STP and LACP, but we need to be able to forward these over tunnels and control that forwarding on a per-port basis thus add a new per-port group_fwd_mask option which only disallows mac pause frames to be forwarded (they're always dropped anyway). The patch does not change the current default situation - all of the others are still restricted unless configured for forwarding. We have successfully tested this patch with LACP and STP forwarding over VxLAN and qinq tunnels. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8d062c5..ea87bd7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -325,6 +325,7 @@ enum { IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.1 From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject name that contains characters other than isalnum() and '_'. It will also reject name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. On the output, nr_map_ids tells the exact user_map_cnt and it will only copy up to the userspace's map_ids buffer is allowed. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b672c5..33ccc47 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,8 @@ struct bpf_prog_aux { struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; + u64 load_time; /* ns since boottime */ + u8 name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e43491a..bd63482 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -175,6 +175,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -210,6 +212,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -812,6 +815,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.1 From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33ccc47..252f4bc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,6 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; + u8 name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd63482..6d2137b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -829,6 +830,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.1 From c7c3e5913bf18eda3cf38932bebdce48351baac9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 19:08:00 -0700 Subject: net: ipv4: remove fib_weight fib_weight in fib_info is set but not used. Remove it and the helpers for setting it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1a7f7e4..f805243 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -122,9 +122,6 @@ struct fib_info { #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_weight; -#endif struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev -- cgit v1.1 From a3f5aa907340b5d7b54223ddbaa90410f168864d Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Fri, 14 Jul 2017 09:27:08 -0400 Subject: i40e: Enable VF to negotiate number of allocated queues Currently the PF allocates a default number of queues for each VF and cannot be changed. This patch enables the VF to request a different number of queues allocated to it. This patch also adds a new virtchnl op and capability flag to facilitate this negotiation. After the PF receives a request message, it will set a requested number of queues for that VF. Then when the VF resets, its VSI will get a new number of queues allocated to it. This is a best effort request and since we only allocate a guaranteed default number, if the VF tries to ask for more than the guaranteed number, there may not be enough in HW to accommodate it unless other queues for other VFs are freed. It should also be noted decreasing the number queues allocated to a VF to below the default will NOT enable the allocation of more than 32 VFs per PF and will not free queues guaranteed to each VF by default. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2b03844..60e5d90 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -135,6 +135,7 @@ enum virtchnl_ops { VIRTCHNL_OP_SET_RSS_HENA = 26, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, + VIRTCHNL_OP_REQUEST_QUEUES = 29, }; /* This macro is used to generate a compilation error if a structure @@ -235,6 +236,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RSS_AQ 0x00000008 #define VIRTCHNL_VF_OFFLOAD_RSS_REG 0x00000010 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR 0x00000020 +#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES 0x00000040 #define VIRTCHNL_VF_OFFLOAD_VLAN 0x00010000 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING 0x00020000 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 0x00040000 @@ -325,6 +327,21 @@ struct virtchnl_vsi_queue_config_info { struct virtchnl_queue_pair_info qpair[1]; }; +/* VIRTCHNL_OP_REQUEST_QUEUES + * VF sends this message to request the PF to allocate additional queues to + * this VF. Each VF gets a guaranteed number of queues on init but asking for + * additional queues must be negotiated. This is a best effort request as it + * is possible the PF does not have enough queues left to support the request. + * If the PF cannot support the number requested it will respond with the + * maximum number it is able to support; otherwise it will respond with the + * number requested. + */ + +/* VF resource request */ +struct virtchnl_vf_res_request { + u16 num_queue_pairs; +}; + VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info); /* VIRTCHNL_OP_CONFIG_IRQ_MAP @@ -691,6 +708,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: break; + case VIRTCHNL_OP_REQUEST_QUEUES: + valid_len = sizeof(struct virtchnl_vf_res_request); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.1 From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 26 Sep 2017 21:32:42 -0700 Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable even on IPv6 sockets. However, it turns out it is perfectly reasonable to want to set freebind on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP socket option on such a socket (they're all blindly errored out). One use case for this is to allow spoofing src ip on a raw socket via sendmsg cmsg. Tested: built, and booted # python >>> import socket >>> SOL_IP = socket.SOL_IP >>> SOL_IPV6 = socket.IPPROTO_IPV6 >>> IP_FREEBIND = 15 >>> IPV6_FREEBIND = 78 >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 0 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 0 >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 1 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 1 Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 46444f8..4f8f3eb 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -284,6 +284,7 @@ struct in6_flowlabel_req { #define IPV6_TRANSPARENT 75 #define IPV6_UNICAST_IF 76 #define IPV6_RECVFRAGSIZE 77 +#define IPV6_FREEBIND 78 /* * Multicast Routing: -- cgit v1.1 From 152402483ed75b167d5628d414e876ffa7a6d4c4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:18 -0400 Subject: net: dsa: add tagging ops to port The DSA tagging protocol operations are specific to each CPU port, thus the dsa_device_ops pointer belongs to the dsa_port structure. >From now on assign a slave's xmit copy from its CPU port tagging operations. This will ease the future support for multiple CPU ports. Also keep the tag_ops at the beginning of the dsa_port structure so that we ensure copies for hot path are in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8dee216..4d1df2f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -175,6 +175,9 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* CPU port tagging operations used by master or slave devices */ + const struct dsa_device_ops *tag_ops; + struct dsa_switch *ds; unsigned int index; const char *name; -- cgit v1.1 From 3e41f93b358a8800194b87995ad076fc50919719 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:19 -0400 Subject: net: dsa: prepare master receive hot path In preparation to make DSA master devices point to their corresponding CPU port instead of the whole tree, add copies of dst and rcv in the dsa_port structure so that we keep fast access in the receive hot path. Also keep the copies at the beginning of the dsa_port structure in order to ensure they are available in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4d1df2f..6bda01f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -178,6 +178,11 @@ struct dsa_port { /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; + /* Copies for faster access in master receive hot path */ + struct dsa_switch_tree *dst; + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt); + struct dsa_switch *ds; unsigned int index; const char *name; -- cgit v1.1 From 2f657a600409f1961d67642fe384a9d4be71d36a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:20 -0400 Subject: net: dsa: change dsa_ptr for a dsa_port With DSA, a master net device (CPU facing interface) has a dsa_ptr pointer to which hangs a dsa_switch_tree. This is not correct because a master interface is wired to a dedicated switch port, and because we can theoretically have several master interfaces pointing to several CPU ports of the same switch fabric. Change the master interface's dsa_ptr for the CPU dsa_port pointer. This is a step towards supporting multiple CPU ports. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f535779..e1d6ef1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -55,7 +55,7 @@ struct netpoll_info; struct device; struct phy_device; -struct dsa_switch_tree; +struct dsa_port; /* 802.11 specific */ struct wireless_dev; @@ -1752,7 +1752,7 @@ struct net_device { struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) - struct dsa_switch_tree *dsa_ptr; + struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; -- cgit v1.1 From aa193d9b1d7ea6893ce24a9d141f676950563987 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:21 -0400 Subject: net: dsa: remove tag ops from the switch tree Now that the dsa_ptr is a dsa_port instance, there is no need to keep the tag operations in the dsa_switch_tree structure. Remove it. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6bda01f..10dcecc 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -130,11 +130,6 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* Copy of tag_ops->rcv for faster access in hot path */ - struct sk_buff * (*rcv)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt); - /* * The switch port to which the CPU is attached. */ @@ -144,12 +139,6 @@ struct dsa_switch_tree { * Data for the individual switch chips. */ struct dsa_switch *ds[DSA_MAX_SWITCHES]; - - /* - * Tagging protocol operations for adding and removing an - * encapsulation tag. - */ - const struct dsa_device_ops *tag_ops; }; /* TC matchall action types, only mirroring for now */ -- cgit v1.1 From e1cfcbe82b4534bd0f99fef92a6d33843fd85e0e Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: ipv4: Namespaceify tcp_fastopen knob Different namespace application might require enable TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs be per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index abc84d9..16420cc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; + int sysctl_tcp_fastopen; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index 770b608..9e414a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; -- cgit v1.1 From dd000598a39b6937fcefdf143720ec9fb5250e72 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:41 +0800 Subject: ipv4: Remove the 'publish' logic in tcp_fastopen_init_key_once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'publish' logic is not necessary after commit dfea2aa65424 ("tcp: Do not call tcp_fastopen_reset_cipher from interrupt context"), because in tcp_fastopen_cookie_gen,it wouldn't call tcp_fastopen_init_key_once. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9e414a9..d9376e2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,7 +1555,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(bool publish); +void tcp_fastopen_init_key_once(void); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); -- cgit v1.1 From 437138485656c41e32b8c63c0987cfa0348be0e6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: ipv4: Namespaceify tcp_fastopen_key knob Different namespace application might require different tcp_fastopen_key independently of the host. David Miller pointed out there is a leak without releasing the context of tcp_fastopen_key during netns teardown. So add the release action in exit_batch path. Tested: 1. Container namespace: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 4 ++++ include/net/tcp.h | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 16420cc..7bb9603 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -36,6 +36,8 @@ struct inet_timewait_death_row { int sysctl_max_tw_buckets; }; +struct tcp_fastopen_context; + struct netns_ipv4 { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; @@ -129,6 +131,8 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + spinlock_t tcp_fastopen_ctx_lock; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index d9376e2..6d25d83 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1549,13 +1549,13 @@ struct tcp_fastopen_request { }; void tcp_free_fastopen_req(struct tcp_sock *tp); -extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; -int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_ctx_destroy(struct net *net); +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(void); +void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); -- cgit v1.1 From 3733be14a32bae288b61ed28341e593baba983af Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Different namespace application might require different time period in second to disable Fastopen on active TCP sockets. Tested: Simulate following similar situation that the server's data gets dropped after 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] And then print netstat of TCPFastOpenBlackhole, the counter increased as expected when the firewall blackhole issue is detected and active TFO is disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7bb9603..2c4222a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,9 @@ struct netns_ipv4 { int sysctl_tcp_fastopen; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; + unsigned int sysctl_tcp_fastopen_blackhole_timeout; + atomic_t tfo_active_disable_times; + unsigned long tfo_active_disable_stamp; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; -- cgit v1.1 From b80ccfe9bbcac70e66fdfaef73f0988a27f9a68c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 26 Sep 2017 20:37:22 -0700 Subject: net-ipv6: remove unused IP6_ECN_clear() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is unused, and furthermore it is buggy since it suffers from the same issue that requires IP6_ECN_set_ce() to take a pointer to the skb so that it may (in case of CHECKSUM_COMPLETE) update skb->csum Instead of fixing it, let's just outright remove it. Tested: builds, and 'git grep IP6_ECN_clear' comes up empty Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index dce2d58..f5ff16d 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -133,11 +133,6 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) return 1; } -static inline void IP6_ECN_clear(struct ipv6hdr *iph) -{ - *(__be32*)iph &= ~htonl(INET_ECN_MASK << 20); -} - static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; -- cgit v1.1 From 66b1bedf662518e9b6367990a87e9601b35a94c1 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:14 +0300 Subject: ieee80211: Add WFA TPC report element OUI type Add Transmit Power Control OUI type definition for WLAN_OUI_MICROSOFT. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55a604a..ee6657a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2445,6 +2445,7 @@ enum ieee80211_sa_query_action { #define WLAN_OUI_TYPE_MICROSOFT_WPA 1 #define WLAN_OUI_TYPE_MICROSOFT_WMM 2 #define WLAN_OUI_TYPE_MICROSOFT_WPS 4 +#define WLAN_OUI_TYPE_MICROSOFT_TPC 8 /* * WMM/802.11e Tspec Element -- cgit v1.1 From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:49 +0200 Subject: cfg80211/nl80211: add a port authorized event Add an event that indicates that a connection is authorized (i.e. the 4 way handshake was performed by the driver). This event should be sent by the driver after sending a connect/roamed event. This is useful for networks that require 802.1X authentication. In cases that the driver supports 4 way handshake offload, but the 802.1X authentication is managed by user space, the driver needs to inform user space right after the 802.11 association was completed so user space can initialize its 802.1X state machine etc. However, it is also possible that the AP will choose to skip the 802.1X authentication (e.g. when PMKSA caching is used) and proceed with the 4 way handshake immediately. In this case the driver needs to inform user space that 802.1X authentication is no longer required (e.g. to prevent user space from disconnecting since it did not get any EAPOLs from the AP). This is also useful for roaming, in which case it is possible that the driver used the Fast Transition protocol so 802.1X is not required. Since there will now be a dedicated notification indicating that the connection is authorized, the authorized flag can be removed from the roamed event. Drivers can send the new port authorized event right after sending the roamed event to indicate the new AP is already authorized. This therefore reserves the old PORT_AUTHORIZED attribute. Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 21 +++++++++++++++++---- include/uapi/linux/nl80211.h | 28 +++++++++++++++++----------- 2 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cc19960..8b8118a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5428,9 +5428,6 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length - * @authorized: true if the 802.1X authentication was done by the driver or is - * not needed (e.g., when Fast Transition protocol was used), false - * otherwise. Ignored for networks that don't use 802.1X authentication. */ struct cfg80211_roam_info { struct ieee80211_channel *channel; @@ -5440,7 +5437,6 @@ struct cfg80211_roam_info { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; - bool authorized; }; /** @@ -5465,6 +5461,23 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp); /** + * cfg80211_port_authorized - notify cfg80211 of successful security association + * + * @dev: network device + * @bssid: the BSSID of the AP + * @gfp: allocation flags + * + * This function should be called by a driver that supports 4 way handshake + * offload after a security association was successfully established (i.e., + * the 4 way handshake was completed successfully). The call to this function + * should be preceded with a call to cfg80211_connect_result(), + * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to + * indicate the 802.11 association. + */ +void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, + gfp_t gfp); + +/** * cfg80211_disconnected - notify cfg80211 that connection was dropped * * @dev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 59ba6ca..95832ce 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -569,13 +569,14 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * @NL80211_CMD_ROAM: notifcation indicating the card/driver roamed by itself. - * When the driver roamed in a network that requires 802.1X authentication, - * %NL80211_ATTR_PORT_AUTHORIZED should be set if the 802.1X authentication - * was done by the driver or if roaming was done using Fast Transition - * protocol (in which case 802.1X authentication is not needed). If - * %NL80211_ATTR_PORT_AUTHORIZED is not set, user space is responsible for - * the 802.1X authentication. + * When establishing a security association, drivers that support 4 way + * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when + * the 4 way handshake is completed successfully. + * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. + * When a security association was established with the new AP (e.g. if + * the FT protocol was used for roaming or the driver completed the 4 way + * handshake), this event should be followed by an + * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and @@ -982,6 +983,12 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * &NL80211_ATTR_MAC. + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way + * handshake was completed successfully by the driver. The BSSID is + * specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake + * offload should send this event after indicating 802.11 association with + * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed + * &NL80211_CMD_DISCONNECT should be indicated instead. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1185,6 +1192,8 @@ enum nl80211_commands { NL80211_CMD_SET_PMK, NL80211_CMD_DEL_PMK, + NL80211_CMD_PORT_AUTHORIZED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2138,10 +2147,7 @@ enum nl80211_commands { * in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it * wants to use the supported offload of the 4-way handshake. * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. - * @NL80211_ATTR_PORT_AUTHORIZED: flag attribute used in %NL80211_CMD_ROAMED - * notification indicating that that 802.1X authentication was done by - * the driver or is not needed (because roaming used the Fast Transition - * protocol). + * @NL80211_ATTR_PORT_AUTHORIZED: (reserved) * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined -- cgit v1.1 From 32f16369e59fcc505c5ed93a6a8cad3d5636b463 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2017 10:41:15 +0200 Subject: net/dst: Make skb parameter of skb{metadata_dst, tunnel_info}() const Make the skb parameter of skb_metadata_dst() and skb_tunnel_info() const as they are not modified. This is in preparation for using them in call-sites where skb is const. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a803129..9fba2eb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -24,7 +24,7 @@ struct metadata_dst { } u; }; -static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); @@ -34,7 +34,8 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info * +skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; -- cgit v1.1 From f2f2efb807d339513199b1bb771806c90cce83ae Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:28 +0300 Subject: byteorder: Move {cpu_to_be32, be32_to_cpu}_array() from Thunderbolt to core We will be using these when communicating XDomain discovery protocol over Thunderbolt link but they might be useful for other drivers as well. Make them available through byteorder/generic.h. Suggested-by: Andy Shevchenko Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 89f67c1..805d166 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -170,4 +170,20 @@ static inline void be64_add_cpu(__be64 *var, u64 val) *var = cpu_to_be64(be64_to_cpu(*var) + val); } +static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = cpu_to_be32(src[i]); +} + +static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = be32_to_cpu(src[i]); +} + #endif /* _LINUX_BYTEORDER_GENERIC_H */ -- cgit v1.1 From cdae7c07e3e3509eaabc18c1640a55dc5b99c179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:30 +0300 Subject: thunderbolt: Add support for XDomain properties Thunderbolt XDomain discovery protocol uses directories which contain properties and other directories to exchange information about what capabilities the remote host supports. This also includes identification information like device ID and name. This adds support for parsing and formatting these properties and establishes an API drivers can use in addition to the core Thunderbolt driver. This API is exposed in a new header: include/linux/thunderbolt.h. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 89 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 include/linux/thunderbolt.h (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h new file mode 100644 index 0000000..96561c1 --- /dev/null +++ b/include/linux/thunderbolt.h @@ -0,0 +1,89 @@ +/* + * Thunderbolt service API + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef THUNDERBOLT_H_ +#define THUNDERBOLT_H_ + +#include +#include + +/** + * struct tb_property_dir - XDomain property directory + * @uuid: Directory UUID or %NULL if root directory + * @properties: List of properties in this directory + * + * User needs to provide serialization if needed. + */ +struct tb_property_dir { + const uuid_t *uuid; + struct list_head properties; +}; + +enum tb_property_type { + TB_PROPERTY_TYPE_UNKNOWN = 0x00, + TB_PROPERTY_TYPE_DIRECTORY = 0x44, + TB_PROPERTY_TYPE_DATA = 0x64, + TB_PROPERTY_TYPE_TEXT = 0x74, + TB_PROPERTY_TYPE_VALUE = 0x76, +}; + +#define TB_PROPERTY_KEY_SIZE 8 + +/** + * struct tb_property - XDomain property + * @list: Used to link properties together in a directory + * @key: Key for the property (always terminated). + * @type: Type of the property + * @length: Length of the property data in dwords + * @value: Property value + * + * Users use @type to determine which field in @value is filled. + */ +struct tb_property { + struct list_head list; + char key[TB_PROPERTY_KEY_SIZE + 1]; + enum tb_property_type type; + size_t length; + union { + struct tb_property_dir *dir; + u8 *data; + char *text; + u32 immediate; + } value; +}; + +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len); +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len); +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid); +void tb_property_free_dir(struct tb_property_dir *dir); +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value); +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen); +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text); +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir); +void tb_property_remove(struct tb_property *tb_property); +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type); +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev); + +#define tb_property_for_each(dir, property) \ + for (property = tb_property_get_next(dir, NULL); \ + property; \ + property = tb_property_get_next(dir, property)) + +#endif /* THUNDERBOLT_H_ */ -- cgit v1.1 From eaf8ff35a345449207ad116e2574c19780ec9a98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:31 +0300 Subject: thunderbolt: Move enum tb_cfg_pkg_type to thunderbolt.h These will be needed by Thunderbolt services when sending and receiving XDomain control messages. While there change TB_CFG_PKG_PREPARE_TO_SLEEP value to be decimal in order to be consistent with other members. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 96561c1..b512b1e 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -1,6 +1,7 @@ /* * Thunderbolt service API * + * Copyright (C) 2014 Andreas Noever * Copyright (C) 2017, Intel Corporation * Authors: Michael Jamet * Mika Westerberg @@ -16,6 +17,22 @@ #include #include +enum tb_cfg_pkg_type { + TB_CFG_PKG_READ = 1, + TB_CFG_PKG_WRITE = 2, + TB_CFG_PKG_ERROR = 3, + TB_CFG_PKG_NOTIFY_ACK = 4, + TB_CFG_PKG_EVENT = 5, + TB_CFG_PKG_XDOMAIN_REQ = 6, + TB_CFG_PKG_XDOMAIN_RESP = 7, + TB_CFG_PKG_OVERRIDE = 8, + TB_CFG_PKG_RESET = 9, + TB_CFG_PKG_ICM_EVENT = 10, + TB_CFG_PKG_ICM_CMD = 11, + TB_CFG_PKG_ICM_RESP = 12, + TB_CFG_PKG_PREPARE_TO_SLEEP = 13, +}; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.1 From 9e99b9f4d5c36340dabda6d14053195b2a43796b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:32 +0300 Subject: thunderbolt: Move thunderbolt domain structure to thunderbolt.h These are needed by Thunderbolt services so move them to thunderbolt.h to make sure they are available outside of drivers/thunderbolt. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index b512b1e..910b1bf 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -14,7 +14,9 @@ #ifndef THUNDERBOLT_H_ #define THUNDERBOLT_H_ +#include #include +#include #include enum tb_cfg_pkg_type { @@ -34,6 +36,49 @@ enum tb_cfg_pkg_type { }; /** + * enum tb_security_level - Thunderbolt security level + * @TB_SECURITY_NONE: No security, legacy mode + * @TB_SECURITY_USER: User approval required at minimum + * @TB_SECURITY_SECURE: One time saved key required at minimum + * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) + */ +enum tb_security_level { + TB_SECURITY_NONE, + TB_SECURITY_USER, + TB_SECURITY_SECURE, + TB_SECURITY_DPONLY, +}; + +/** + * struct tb - main thunderbolt bus structure + * @dev: Domain device + * @lock: Big lock. Must be held when accessing any struct + * tb_switch / struct tb_port. + * @nhi: Pointer to the NHI structure + * @ctl: Control channel for this domain + * @wq: Ordered workqueue for all domain specific work + * @root_switch: Root switch of this domain + * @cm_ops: Connection manager specific operations vector + * @index: Linux assigned domain number + * @security_level: Current security level + * @privdata: Private connection manager specific data + */ +struct tb { + struct device dev; + struct mutex lock; + struct tb_nhi *nhi; + struct tb_ctl *ctl; + struct workqueue_struct *wq; + struct tb_switch *root_switch; + const struct tb_cm_ops *cm_ops; + int index; + enum tb_security_level security_level; + unsigned long privdata[0]; +}; + +extern struct bus_type tb_bus_type; + +/** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory * @properties: List of properties in this directory -- cgit v1.1 From e69b71f8458b78a2ef44e3d07374a8f46e45123d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:33 +0300 Subject: thunderbolt: Move tb_switch_phy_port_from_link() to thunderbolt.h A Thunderbolt service might need to find the physical port from a link the cable is connected to. For instance networking driver uses this information to generate MAC address according the Apple ThunderboltIP protocol. Move this function to thunderbolt.h and rename it to tb_phy_port_from_link() to reflect the fact that it does not take switch as parameter. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 910b1bf..43b8d1e 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -78,6 +78,13 @@ struct tb { extern struct bus_type tb_bus_type; +#define TB_LINKS_PER_PHY_PORT 2 + +static inline unsigned int tb_phy_port_from_link(unsigned int link) +{ + return (link - 1) / TB_LINKS_PER_PHY_PORT; +} + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory -- cgit v1.1 From d1ff70241a275133e1a0258b7c23588b122276c8 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:34 +0300 Subject: thunderbolt: Add support for XDomain discovery protocol When two hosts are connected over a Thunderbolt cable, there is a protocol they can use to communicate capabilities supported by the host. The discovery protocol uses automatically configured control channel (ring 0) and is build on top of request/response transactions using special XDomain primitives provided by the Thunderbolt base protocol. The capabilities consists of a root directory block of basic properties used for identification of the host, and then there can be zero or more directories each describing a Thunderbolt service and its capabilities. Once both sides have discovered what is supported the two hosts can setup high-speed DMA paths and transfer data to the other side using whatever protocol was agreed based on the properties. The software protocol used to communicate which DMA paths to enable is service specific. This patch adds support for the XDomain discovery protocol to the Thunderbolt bus. We model each remote host connection as a Linux XDomain device. For each Thunderbolt service found supported on the XDomain device, we create Linux Thunderbolt service device which Thunderbolt service drivers can then bind to based on the protocol identification information retrieved from the property directory describing the service. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/mod_devicetable.h | 26 +++++ include/linux/thunderbolt.h | 242 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 268 insertions(+) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 694cebb..7625c3b 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -683,5 +683,31 @@ struct fsl_mc_device_id { const char obj_type[16]; }; +/** + * struct tb_service_id - Thunderbolt service identifiers + * @match_flags: Flags used to match the structure + * @protocol_key: Protocol key the service supports + * @protocol_id: Protocol id the service supports + * @protocol_version: Version of the protocol + * @protocol_revision: Revision of the protocol software + * @driver_data: Driver specific data + * + * Thunderbolt XDomain services are exposed as devices where each device + * carries the protocol information the service supports. Thunderbolt + * XDomain service drivers match against that information. + */ +struct tb_service_id { + __u32 match_flags; + char protocol_key[8 + 1]; + __u32 protocol_id; + __u32 protocol_version; + __u32 protocol_revision; + kernel_ulong_t driver_data; +}; + +#define TBSVC_MATCH_PROTOCOL_KEY 0x0001 +#define TBSVC_MATCH_PROTOCOL_ID 0x0002 +#define TBSVC_MATCH_PROTOCOL_VERSION 0x0004 +#define TBSVC_MATCH_PROTOCOL_REVISION 0x0008 #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 43b8d1e..18c0e3d 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -17,6 +17,7 @@ #include #include #include +#include #include enum tb_cfg_pkg_type { @@ -77,6 +78,8 @@ struct tb { }; extern struct bus_type tb_bus_type; +extern struct device_type tb_service_type; +extern struct device_type tb_xdomain_type; #define TB_LINKS_PER_PHY_PORT 2 @@ -155,4 +158,243 @@ struct tb_property *tb_property_get_next(struct tb_property_dir *dir, property; \ property = tb_property_get_next(dir, property)) +int tb_register_property_dir(const char *key, struct tb_property_dir *dir); +void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); + +/** + * struct tb_xdomain - Cross-domain (XDomain) connection + * @dev: XDomain device + * @tb: Pointer to the domain + * @remote_uuid: UUID of the remote domain (host) + * @local_uuid: Cached local UUID + * @route: Route string the other domain can be reached + * @vendor: Vendor ID of the remote domain + * @device: Device ID of the demote domain + * @lock: Lock to serialize access to the following fields of this structure + * @vendor_name: Name of the vendor (or %NULL if not known) + * @device_name: Name of the device (or %NULL if not known) + * @is_unplugged: The XDomain is unplugged + * @resume: The XDomain is being resumed + * @transmit_path: HopID which the remote end expects us to transmit + * @transmit_ring: Local ring (hop) where outgoing packets are pushed + * @receive_path: HopID which we expect the remote end to transmit + * @receive_ring: Local ring (hop) where incoming packets arrive + * @service_ids: Used to generate IDs for the services + * @properties: Properties exported by the remote domain + * @property_block_gen: Generation of @properties + * @properties_lock: Lock protecting @properties. + * @get_properties_work: Work used to get remote domain properties + * @properties_retries: Number of times left to read properties + * @properties_changed_work: Work used to notify the remote domain that + * our properties have changed + * @properties_changed_retries: Number of times left to send properties + * changed notification + * @link: Root switch link the remote domain is connected (ICM only) + * @depth: Depth in the chain the remote domain is connected (ICM only) + * + * This structure represents connection across two domains (hosts). + * Each XDomain contains zero or more services which are exposed as + * &struct tb_service objects. + * + * Service drivers may access this structure if they need to enumerate + * non-standard properties but they need hold @lock when doing so + * because properties can be changed asynchronously in response to + * changes in the remote domain. + */ +struct tb_xdomain { + struct device dev; + struct tb *tb; + uuid_t *remote_uuid; + const uuid_t *local_uuid; + u64 route; + u16 vendor; + u16 device; + struct mutex lock; + const char *vendor_name; + const char *device_name; + bool is_unplugged; + bool resume; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; + struct ida service_ids; + struct tb_property_dir *properties; + u32 property_block_gen; + struct delayed_work get_properties_work; + int properties_retries; + struct delayed_work properties_changed_work; + int properties_changed_retries; + u8 link; + u8 depth; +}; + +int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path, + u16 transmit_ring, u16 receive_path, + u16 receive_ring); +int tb_xdomain_disable_paths(struct tb_xdomain *xd); +struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid); + +static inline struct tb_xdomain * +tb_xdomain_find_by_uuid_locked(struct tb *tb, const uuid_t *uuid) +{ + struct tb_xdomain *xd; + + mutex_lock(&tb->lock); + xd = tb_xdomain_find_by_uuid(tb, uuid); + mutex_unlock(&tb->lock); + + return xd; +} + +static inline struct tb_xdomain *tb_xdomain_get(struct tb_xdomain *xd) +{ + if (xd) + get_device(&xd->dev); + return xd; +} + +static inline void tb_xdomain_put(struct tb_xdomain *xd) +{ + if (xd) + put_device(&xd->dev); +} + +static inline bool tb_is_xdomain(const struct device *dev) +{ + return dev->type == &tb_xdomain_type; +} + +static inline struct tb_xdomain *tb_to_xdomain(struct device *dev) +{ + if (tb_is_xdomain(dev)) + return container_of(dev, struct tb_xdomain, dev); + return NULL; +} + +int tb_xdomain_response(struct tb_xdomain *xd, const void *response, + size_t size, enum tb_cfg_pkg_type type); +int tb_xdomain_request(struct tb_xdomain *xd, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, + void *response, size_t response_size, + enum tb_cfg_pkg_type response_type, + unsigned int timeout_msec); + +/** + * tb_protocol_handler - Protocol specific handler + * @uuid: XDomain messages with this UUID are dispatched to this handler + * @callback: Callback called with the XDomain message. Returning %1 + * here tells the XDomain core that the message was handled + * by this handler and should not be forwared to other + * handlers. + * @data: Data passed with the callback + * @list: Handlers are linked using this + * + * Thunderbolt services can hook into incoming XDomain requests by + * registering protocol handler. Only limitation is that the XDomain + * discovery protocol UUID cannot be registered since it is handled by + * the core XDomain code. + * + * The @callback must check that the message is really directed to the + * service the driver implements. + */ +struct tb_protocol_handler { + const uuid_t *uuid; + int (*callback)(const void *buf, size_t size, void *data); + void *data; + struct list_head list; +}; + +int tb_register_protocol_handler(struct tb_protocol_handler *handler); +void tb_unregister_protocol_handler(struct tb_protocol_handler *handler); + +/** + * struct tb_service - Thunderbolt service + * @dev: XDomain device + * @id: ID of the service (shown in sysfs) + * @key: Protocol key from the properties directory + * @prtcid: Protocol ID from the properties directory + * @prtcvers: Protocol version from the properties directory + * @prtcrevs: Protocol software revision from the properties directory + * @prtcstns: Protocol settings mask from the properties directory + * + * Each domain exposes set of services it supports as collection of + * properties. For each service there will be one corresponding + * &struct tb_service. Service drivers are bound to these. + */ +struct tb_service { + struct device dev; + int id; + const char *key; + u32 prtcid; + u32 prtcvers; + u32 prtcrevs; + u32 prtcstns; +}; + +static inline struct tb_service *tb_service_get(struct tb_service *svc) +{ + if (svc) + get_device(&svc->dev); + return svc; +} + +static inline void tb_service_put(struct tb_service *svc) +{ + if (svc) + put_device(&svc->dev); +} + +static inline bool tb_is_service(const struct device *dev) +{ + return dev->type == &tb_service_type; +} + +static inline struct tb_service *tb_to_service(struct device *dev) +{ + if (tb_is_service(dev)) + return container_of(dev, struct tb_service, dev); + return NULL; +} + +/** + * tb_service_driver - Thunderbolt service driver + * @driver: Driver structure + * @probe: Called when the driver is probed + * @remove: Called when the driver is removed (optional) + * @shutdown: Called at shutdown time to stop the service (optional) + * @id_table: Table of service identifiers the driver supports + */ +struct tb_service_driver { + struct device_driver driver; + int (*probe)(struct tb_service *svc, const struct tb_service_id *id); + void (*remove)(struct tb_service *svc); + void (*shutdown)(struct tb_service *svc); + const struct tb_service_id *id_table; +}; + +#define TB_SERVICE(key, id) \ + .match_flags = TBSVC_MATCH_PROTOCOL_KEY | \ + TBSVC_MATCH_PROTOCOL_ID, \ + .protocol_key = (key), \ + .protocol_id = (id) + +int tb_register_service_driver(struct tb_service_driver *drv); +void tb_unregister_service_driver(struct tb_service_driver *drv); + +static inline void *tb_service_get_drvdata(const struct tb_service *svc) +{ + return dev_get_drvdata(&svc->dev); +} + +static inline void tb_service_set_drvdata(struct tb_service *svc, void *data) +{ + dev_set_drvdata(&svc->dev, data); +} + +static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) +{ + return tb_to_xdomain(svc->dev.parent); +} + #endif /* THUNDERBOLT_H_ */ -- cgit v1.1 From 3b3d9f4da96493e4f68d0a80ab210763a24f8b33 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:37 +0300 Subject: thunderbolt: Export ring handling functions to modules These are used by Thunderbolt services to send and receive frames over the high-speed DMA rings. We also put the functions to tb_ namespace to make sure we do not collide with others and add missing kernel-doc comments for the exported functions. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 158 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 18c0e3d..9ddb83a 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -15,10 +15,12 @@ #define THUNDERBOLT_H_ #include +#include #include #include #include #include +#include enum tb_cfg_pkg_type { TB_CFG_PKG_READ = 1, @@ -397,4 +399,160 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) return tb_to_xdomain(svc->dev.parent); } +/** + * struct tb_nhi - thunderbolt native host interface + * @lock: Must be held during ring creation/destruction. Is acquired by + * interrupt_work when dispatching interrupts to individual rings. + * @pdev: Pointer to the PCI device + * @iobase: MMIO space of the NHI + * @tx_rings: All Tx rings available on this host controller + * @rx_rings: All Rx rings available on this host controller + * @msix_ida: Used to allocate MSI-X vectors for rings + * @going_away: The host controller device is about to disappear so when + * this flag is set, avoid touching the hardware anymore. + * @interrupt_work: Work scheduled to handle ring interrupt when no + * MSI-X is used. + * @hop_count: Number of rings (end point hops) supported by NHI. + */ +struct tb_nhi { + struct mutex lock; + struct pci_dev *pdev; + void __iomem *iobase; + struct tb_ring **tx_rings; + struct tb_ring **rx_rings; + struct ida msix_ida; + bool going_away; + struct work_struct interrupt_work; + u32 hop_count; +}; + +/** + * struct tb_ring - thunderbolt TX or RX ring associated with a NHI + * @lock: Lock serializing actions to this ring. Must be acquired after + * nhi->lock. + * @nhi: Pointer to the native host controller interface + * @size: Size of the ring + * @hop: Hop (DMA channel) associated with this ring + * @head: Head of the ring (write next descriptor here) + * @tail: Tail of the ring (complete next descriptor here) + * @descriptors: Allocated descriptors for this ring + * @queue: Queue holding frames to be transferred over this ring + * @in_flight: Queue holding frames that are currently in flight + * @work: Interrupt work structure + * @is_tx: Is the ring Tx or Rx + * @running: Is the ring running + * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. + * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) + * @flags: Ring specific flags + * @sof_mask: Bit mask used to detect start of frame PDF + * @eof_mask: Bit mask used to detect end of frame PDF + */ +struct tb_ring { + struct mutex lock; + struct tb_nhi *nhi; + int size; + int hop; + int head; + int tail; + struct ring_desc *descriptors; + dma_addr_t descriptors_dma; + struct list_head queue; + struct list_head in_flight; + struct work_struct work; + bool is_tx:1; + bool running:1; + int irq; + u8 vector; + unsigned int flags; + u16 sof_mask; + u16 eof_mask; +}; + +/* Leave ring interrupt enabled on suspend */ +#define RING_FLAG_NO_SUSPEND BIT(0) +/* Configure the ring to be in frame mode */ +#define RING_FLAG_FRAME BIT(1) +/* Enable end-to-end flow control */ +#define RING_FLAG_E2E BIT(2) + +struct ring_frame; +typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); + +/** + * struct ring_frame - For use with ring_rx/ring_tx + * @buffer_phy: DMA mapped address of the frame + * @callback: Callback called when the frame is finished + * @list: Frame is linked to a queue using this + * @size: Size of the frame in bytes (%0 means %4096) + * @flags: Flags for the frame (see &enum ring_desc_flags) + * @eof: End of frame protocol defined field + * @sof: Start of frame protocol defined field + */ +struct ring_frame { + dma_addr_t buffer_phy; + ring_cb callback; + struct list_head list; + u32 size:12; + u32 flags:12; + u32 eof:4; + u32 sof:4; +}; + +/* Minimum size for ring_rx */ +#define TB_FRAME_SIZE 0x100 + +struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags); +struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags, u16 sof_mask, + u16 eof_mask); +void tb_ring_start(struct tb_ring *ring); +void tb_ring_stop(struct tb_ring *ring); +void tb_ring_free(struct tb_ring *ring); + +int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); + +/** + * tb_ring_rx() - enqueue a frame on an RX ring + * @ring: Ring to enqueue the frame + * @frame: Frame to enqueue + * + * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The + * buffer must contain at least %TB_FRAME_SIZE bytes. + * + * @frame->callback will be invoked with @frame->size, @frame->flags, + * @frame->eof, @frame->sof set once the frame has been received. + * + * If ring_stop() is called after the packet has been enqueued + * @frame->callback will be called with canceled set to true. + * + * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + */ +static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) +{ + WARN_ON(ring->is_tx); + return __tb_ring_enqueue(ring, frame); +} + +/** + * tb_ring_tx() - enqueue a frame on an TX ring + * @ring: Ring the enqueue the frame + * @frame: Frame to enqueue + * + * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, + * @frame->eof and @frame->sof have to be set. + * + * @frame->callback will be invoked with once the frame has been transmitted. + * + * If ring_stop() is called after the packet has been enqueued @frame->callback + * will be called with canceled set to true. + * + * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + */ +static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) +{ + WARN_ON(!ring->is_tx); + return __tb_ring_enqueue(ring, frame); +} + #endif /* THUNDERBOLT_H_ */ -- cgit v1.1 From 2a91ec63f8a11e70d4b958dd4df867fec0247179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:38 +0300 Subject: thunderbolt: Move ring descriptor flags to thunderbolt.h A Thunderbolt service driver might need to check if there was an error with the descriptor when in frame mode. We also add two Rx specific error flags RING_DESC_CRC_ERROR and RING_DESC_BUFFER_OVERRUN. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9ddb83a..e3b9af7 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -479,6 +479,24 @@ struct ring_frame; typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); /** + * enum ring_desc_flags - Flags for DMA ring descriptor + * %RING_DESC_ISOCH: Enable isonchronous DMA (Tx only) + * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) + * %RING_DESC_COMPLETED: Descriptor completed (set by NHI) + * %RING_DESC_POSTED: Always set this + * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun + * %RING_DESC_INTERRUPT: Request an interrupt on completion + */ +enum ring_desc_flags { + RING_DESC_ISOCH = 0x1, + RING_DESC_CRC_ERROR = 0x1, + RING_DESC_COMPLETED = 0x2, + RING_DESC_POSTED = 0x4, + RING_DESC_BUFFER_OVERRUN = 0x04, + RING_DESC_INTERRUPT = 0x8, +}; + +/** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame * @callback: Callback called when the frame is finished -- cgit v1.1 From 22b7de1000e66d739c431d6be4e7e97c69fa7c98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:39 +0300 Subject: thunderbolt: Use spinlock in ring serialization This makes it possible to enqueue frames also from atomic context which is needed for example, when networking packets are sent over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index e3b9af7..cf9e42d 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -448,7 +448,7 @@ struct tb_nhi { * @eof_mask: Bit mask used to detect end of frame PDF */ struct tb_ring { - struct mutex lock; + spinlock_t lock; struct tb_nhi *nhi; int size; int hop; -- cgit v1.1 From 59120e06101db72442acf4c8b364a0c76d8faa68 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:40 +0300 Subject: thunderbolt: Use spinlock in NHI serialization This is needed because ring polling functionality can be called from atomic contexts when networking and other high-speed traffic is transferred over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index cf9e42d..d59e3f9 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -415,7 +415,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) * @hop_count: Number of rings (end point hops) supported by NHI. */ struct tb_nhi { - struct mutex lock; + spinlock_t lock; struct pci_dev *pdev; void __iomem *iobase; struct tb_ring **tx_rings; -- cgit v1.1 From 4ffe722eefcb07c76701f03e0d759fbaecedf79f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:41 +0300 Subject: thunderbolt: Add polling mode for rings In order to support things like networking over Thunderbolt cable, there needs to be a way to switch the ring to a mode where it can be polled with the interrupt masked. We implement such mode so that the caller can allocate a ring by passing pointer to a function that is then called when an interrupt is triggered. Completed frames can be fetched using tb_ring_poll() and the interrupt can be re-enabled when the caller is finished with polling by using tb_ring_poll_complete(). Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index d59e3f9..36925e3 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -446,6 +446,9 @@ struct tb_nhi { * @flags: Ring specific flags * @sof_mask: Bit mask used to detect start of frame PDF * @eof_mask: Bit mask used to detect end of frame PDF + * @start_poll: Called when ring interrupt is triggered to start + * polling. Passing %NULL keeps the ring in interrupt mode. + * @poll_data: Data passed to @start_poll */ struct tb_ring { spinlock_t lock; @@ -466,6 +469,8 @@ struct tb_ring { unsigned int flags; u16 sof_mask; u16 eof_mask; + void (*start_poll)(void *data); + void *poll_data; }; /* Leave ring interrupt enabled on suspend */ @@ -499,7 +504,7 @@ enum ring_desc_flags { /** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame - * @callback: Callback called when the frame is finished + * @callback: Callback called when the frame is finished (optional) * @list: Frame is linked to a queue using this * @size: Size of the frame in bytes (%0 means %4096) * @flags: Flags for the frame (see &enum ring_desc_flags) @@ -522,8 +527,8 @@ struct ring_frame { struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags); struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, - u16 eof_mask); + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data); void tb_ring_start(struct tb_ring *ring); void tb_ring_stop(struct tb_ring *ring); void tb_ring_free(struct tb_ring *ring); @@ -535,8 +540,8 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); * @ring: Ring to enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The - * buffer must contain at least %TB_FRAME_SIZE bytes. + * @frame->buffer, @frame->buffer_phy have to be set. The buffer must + * contain at least %TB_FRAME_SIZE bytes. * * @frame->callback will be invoked with @frame->size, @frame->flags, * @frame->eof, @frame->sof set once the frame has been received. @@ -557,8 +562,8 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) * @ring: Ring the enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, - * @frame->eof and @frame->sof have to be set. + * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and + * @frame->sof have to be set. * * @frame->callback will be invoked with once the frame has been transmitted. * @@ -573,4 +578,8 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) return __tb_ring_enqueue(ring, frame); } +/* Used only when the ring is in polling mode */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring); +void tb_ring_poll_complete(struct tb_ring *ring); + #endif /* THUNDERBOLT_H_ */ -- cgit v1.1 From 3304559e353f098d7e0ed5ca981e26c406513e12 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:42 +0300 Subject: thunderbolt: Add function to retrieve DMA device for the ring This is needed when Thunderbolt service drivers need to DMA map memory before it is passed down to the ring. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 36925e3..7b69853 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -582,4 +583,16 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) struct ring_frame *tb_ring_poll(struct tb_ring *ring); void tb_ring_poll_complete(struct tb_ring *ring); +/** + * tb_ring_dma_device() - Return device used for DMA mapping + * @ring: Ring whose DMA device is retrieved + * + * Use this function when you are mapping DMA for buffers that are + * passed to the ring for sending/receiving. + */ +static inline struct device *tb_ring_dma_device(struct tb_ring *ring) +{ + return &ring->nhi->pdev->dev; +} + #endif /* THUNDERBOLT_H_ */ -- cgit v1.1 From abf4bb6b63d0a54266f8e7eff3720c1974063971 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:06 +0200 Subject: skbuff: Add the offload_mr_fwd_mark field Similarly to the offload_fwd_mark field, the offload_mr_fwd_mark field is used to allow partial offloading of MFC multicast routes. Switchdev drivers can offload MFC multicast routes to the hardware by registering to the FIB notification chain. When one of the route output interfaces is not offload-able, i.e. has different parent ID, the route cannot be fully offloaded by the hardware. Examples to non-offload-able devices are a management NIC, dummy device, pimreg device, etc. Similar problem exists in the bridge module, as one bridge can hold interfaces with different parent IDs. At the bridge, the problem is solved by the offload_fwd_mark skb field. Currently, when a route cannot go through full offload, the only solution for a switchdev driver is not to offload it at all and let the packet go through slow path. Using the offload_mr_fwd_mark field, a driver can indicate that a packet was already forwarded by hardware to all the devices with the same parent ID as the input device. Further patches in this patch-set are going to enhance ipmr to skip multicast forwarding to devices with the same parent ID if a packets is marked with that field. The reason why the already existing "offload_fwd_mark" bit cannot be used is that a switchdev driver would want to make the distinction between a packet that has already gone through L2 forwarding but did not go through multicast forwarding, and a packet that has already gone through both L2 and multicast forwarding. For example: when a packet is ingressing from a switchport enslaved to a bridge, which is configured with multicast forwarding, the following scenarios are possible: - The packet can be trapped to the CPU due to exception while multicast forwarding (for example, MTU error). In that case, it had already gone through L2 forwarding in the hardware, thus A switchdev driver would want to set the skb->offload_fwd_mark and not the skb->offload_mr_fwd_mark. - The packet can also be trapped due to a pimreg/dummy device used as one of the output interfaces. In that case, it can go through both L2 and (partial) multicast forwarding inside the hardware, thus a switchdev driver would want to set both the skb->offload_fwd_mark and skb->offload_mr_fwd_mark. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 19e64bf..ada8214 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -772,6 +772,7 @@ struct sk_buff { __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; + __u8 offload_mr_fwd_mark:1; #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; -- cgit v1.1 From 5d8b3e69fc5e5ccafc9db1251bb7c78a8622fddd Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:07 +0200 Subject: ipv4: ipmr: Add the parent ID field to VIF struct In order to allow the ipmr module to do partial multicast forwarding according to the device parent ID, add the device parent ID field to the VIF struct. This way, the forwarding path can use the parent ID field without invoking switchdev calls, which requires the RTNL lock. When a new VIF is added, set the device parent ID field in it by invoking the switchdev_port_attr_get call. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index b072a84..8242d05 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -57,6 +57,7 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) struct vif_device { struct net_device *dev; /* Device we are using */ + struct netdev_phys_item_id dev_parent_id; /* Device parent ID */ unsigned long bytes_in,bytes_out; unsigned long pkt_in,pkt_out; /* Statistics */ unsigned long rate_limit; /* Traffic shaping (NI) */ -- cgit v1.1 From 6c5570016b972d9b1f0f6c2dca9cc0422b1f92bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Oct 2017 23:50:05 +0200 Subject: net: core: decouple ifalias get/set from rtnl lock Device alias can be set by either rtnetlink (rtnl is held) or sysfs. rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose. Add an extra mutex for it and use rcu to protect concurrent accesses. This allows the sysfs path to not take rtnl and would later allow to not hold it when dumping ifalias. Based on suggestion from Eric Dumazet. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e1d6ef1..d04424c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -826,6 +826,11 @@ struct xfrmdev_ops { }; #endif +struct dev_ifalias { + struct rcu_head rcuhead; + char ifalias[]; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1632,7 +1637,7 @@ enum netdev_priv_flags { struct net_device { char name[IFNAMSIZ]; struct hlist_node name_hlist; - char *ifalias; + struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one @@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); +int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); int dev_set_mtu(struct net_device *, int); -- cgit v1.1 From f952be79cebd49d04154781d99408867a069d375 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:11 -0300 Subject: sctp: introduce struct sctp_stream_out_ext With the stream schedulers, sctp_stream_out will become too big to be allocated by kmalloc and as we need to allocate with BH disabled, we cannot use __vmalloc in sctp_stream_init(). This patch moves out the stats from sctp_stream_out to sctp_stream_out_ext, which will be allocated only when the application tries to sendmsg something on it. Just the introduction of sctp_stream_out_ext would already fix the issue described above by splitting the allocation in two. Moving the stats to it also reduces the pressure on the allocator as we will ask for less memory atomically when creating the socket and we will use GFP_KERNEL later. Then, for stream schedulers, we will just use sctp_stream_out_ext. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 0477945..9b2b30b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,6 +84,7 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; +struct sctp_stream_out; #include @@ -380,6 +381,7 @@ struct sctp_sender_hb_info { int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); @@ -1315,11 +1317,15 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_out_ext { + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; +}; + struct sctp_stream_out { __u16 ssn; __u8 state; - __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; - __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct sctp_stream_out_ext *ext; }; struct sctp_stream_in { -- cgit v1.1 From 2fc019f790312e703efa1a44204c586112a430dc Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:12 -0300 Subject: sctp: introduce sctp_chunk_stream_no Add a helper to fetch the stream number from a given chunk. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9b2b30b..c48f799 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -642,6 +642,11 @@ void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); +static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch) +{ + return ntohs(ch->subh.data_hdr->stream); +} + enum { SCTP_ADDR_NEW, /* new address added to assoc/ep */ SCTP_ADDR_SRC, /* address can be used as source */ -- cgit v1.1 From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:13 -0300 Subject: sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_sched.h | 72 +++++++++++++++++++++++++++++++++++++++++ include/net/sctp/structs.h | 15 +++++++-- include/uapi/linux/sctp.h | 6 ++++ 3 files changed, 90 insertions(+), 3 deletions(-) create mode 100644 include/net/sctp/stream_sched.h (limited to 'include') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h new file mode 100644 index 0000000..c676550 --- /dev/null +++ b/include/net/sctp/stream_sched.h @@ -0,0 +1,72 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * These are definitions used by the stream schedulers, defined in RFC + * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11) + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresses: + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#ifndef __sctp_stream_sched_h__ +#define __sctp_stream_sched_h__ + +struct sctp_sched_ops { + /* Property handling for a given stream */ + int (*set)(struct sctp_stream *stream, __u16 sid, __u16 value, + gfp_t gfp); + int (*get)(struct sctp_stream *stream, __u16 sid, __u16 *value); + + /* Init the specific scheduler */ + int (*init)(struct sctp_stream *stream); + /* Init a stream */ + int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); + /* Frees the entire thing */ + void (*free)(struct sctp_stream *stream); + + /* Enqueue a chunk */ + void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); + /* Dequeue a chunk */ + struct sctp_chunk *(*dequeue)(struct sctp_outq *q); + /* Called only if the chunk fit the packet */ + void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk); + /* Sched all chunks already enqueued */ + void (*sched_all)(struct sctp_stream *steam); + /* Unched all chunks already enqueued */ + void (*unsched_all)(struct sctp_stream *steam); +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched); +int sctp_sched_get_sched(struct sctp_association *asoc); +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp); +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value); +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); + +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); + +#endif /* __sctp_stream_sched_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c48f799..3c22a30 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,7 +84,6 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; -struct sctp_stream_out; #include @@ -531,8 +530,12 @@ struct sctp_chunk { /* How many times this chunk have been sent, for prsctp RTX policy */ int sent_count; - /* This is our link to the per-transport transmitted list. */ - struct list_head transmitted_list; + union { + /* This is our link to the per-transport transmitted list. */ + struct list_head transmitted_list; + /* List in specific stream outq */ + struct list_head stream_list; + }; /* This field is used by chunks that hold fragmented data. * For the first fragment this is the list that holds the rest of @@ -1019,6 +1022,9 @@ struct sctp_outq { /* Data pending that has never been transmitted. */ struct list_head out_chunk_list; + /* Stream scheduler being used */ + struct sctp_sched_ops *sched; + unsigned int out_qlen; /* Total length of queued data chunks. */ /* Error of send failed, may used in SCTP_SEND_FAILED event. */ @@ -1325,6 +1331,7 @@ struct sctp_inithdr_host { struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct list_head outq; /* chunks enqueued by this stream */ }; struct sctp_stream_out { @@ -1342,6 +1349,8 @@ struct sctp_stream { struct sctp_stream_in *in; __u16 outcnt; __u16 incnt; + /* Current stream being sent, if any */ + struct sctp_stream_out *out_curr; }; #define SCTP_STREAM_CLOSED 0x00 diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6217ff8..4487e76 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1088,4 +1088,10 @@ struct sctp_add_streams { uint16_t sas_outstrms; }; +/* SCTP Stream schedulers */ +enum sctp_sched_type { + SCTP_SS_FCFS, + SCTP_SS_MAX = SCTP_SS_FCFS +}; + #endif /* _UAPI_SCTP_H */ -- cgit v1.1 From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:14 -0300 Subject: sctp: add sockopt to get/set stream scheduler As defined per RFC Draft ndata Section 4.3.2, named as SCTP_STREAM_SCHEDULER. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 4487e76..0050f10 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -122,6 +122,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RESET_ASSOC 120 #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 +#define SCTP_STREAM_SCHEDULER 123 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 -- cgit v1.1 From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:15 -0300 Subject: sctp: add sockopt to get/set stream scheduler parameters As defined per RFC Draft ndata Section 4.3.3, named as SCTP_STREAM_SCHEDULER_VALUE. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 0050f10..00ac417 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -123,6 +123,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 #define SCTP_STREAM_SCHEDULER 123 +#define SCTP_STREAM_SCHEDULER_VALUE 124 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -815,6 +816,12 @@ struct sctp_assoc_value { uint32_t assoc_value; }; +struct sctp_stream_value { + sctp_assoc_t assoc_id; + uint16_t stream_id; + uint16_t stream_value; +}; + /* * 7.2.2 Peer Address Information * -- cgit v1.1 From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:16 -0300 Subject: sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 24 ++++++++++++++++++++++++ include/uapi/linux/sctp.h | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 3c22a30..40eb8d6 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1328,10 +1328,27 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_priorities { + /* List of priorities scheduled */ + struct list_head prio_sched; + /* List of streams scheduled */ + struct list_head active; + /* The next stream stream in line */ + struct sctp_stream_out_ext *next; + __u16 prio; +}; + struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; struct list_head outq; /* chunks enqueued by this stream */ + union { + struct { + /* Scheduled streams list */ + struct list_head prio_list; + struct sctp_stream_priorities *prio_head; + }; + }; }; struct sctp_stream_out { @@ -1351,6 +1368,13 @@ struct sctp_stream { __u16 incnt; /* Current stream being sent, if any */ struct sctp_stream_out *out_curr; + union { + /* Fields used by priority scheduler */ + struct { + /* List of priorities scheduled */ + struct list_head prio_list; + }; + }; }; #define SCTP_STREAM_CLOSED 0x00 diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 00ac417..850fa8b 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1099,7 +1099,8 @@ struct sctp_add_streams { /* SCTP Stream schedulers */ enum sctp_sched_type { SCTP_SS_FCFS, - SCTP_SS_MAX = SCTP_SS_FCFS + SCTP_SS_PRIO, + SCTP_SS_MAX = SCTP_SS_PRIO }; #endif /* _UAPI_SCTP_H */ -- cgit v1.1 From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:17 -0300 Subject: sctp: introduce round robin stream scheduler This patch introduces RFC Draft ndata section 3.2 Priority Based Scheduler (SCTP_SS_RR). Works by maintaining a list of enqueued streams and tracking the last one used to send data. When the datamsg is done, it switches to the next stream. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 11 +++++++++++ include/uapi/linux/sctp.h | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 40eb8d6..16f949e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1348,6 +1348,10 @@ struct sctp_stream_out_ext { struct list_head prio_list; struct sctp_stream_priorities *prio_head; }; + /* Fields used by RR scheduler */ + struct { + struct list_head rr_list; + }; }; }; @@ -1374,6 +1378,13 @@ struct sctp_stream { /* List of priorities scheduled */ struct list_head prio_list; }; + /* Fields used by RR scheduler */ + struct { + /* List of streams scheduled */ + struct list_head rr_list; + /* The next stream stream in line */ + struct sctp_stream_out_ext *rr_next; + }; }; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 850fa8b..6cd7d41 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1100,7 +1100,8 @@ struct sctp_add_streams { enum sctp_sched_type { SCTP_SS_FCFS, SCTP_SS_PRIO, - SCTP_SS_MAX = SCTP_SS_PRIO + SCTP_SS_RR, + SCTP_SS_MAX = SCTP_SS_RR }; #endif /* _UAPI_SCTP_H */ -- cgit v1.1 From e774d96b7d2c3489bfb5bbdc2b65ed41cd68d3d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:55:29 +0200 Subject: rtnetlink: remove slave_validate callback no users in the tree. Signed-off-by: Florian Westphal Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 21837ca..6520993 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -93,9 +93,6 @@ struct rtnl_link_ops { int slave_maxtype; const struct nla_policy *slave_policy; - int (*slave_validate)(struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack); int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, struct nlattr *tb[], -- cgit v1.1 From 5c45121dc39026ab2139910e57cf933fd57d30f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:58:49 +0200 Subject: rtnetlink: remove __rtnl_af_unregister switch the only caller to rtnl_af_unregister. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6520993..e3ca8e2 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -151,8 +151,6 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void __rtnl_af_unregister(struct rtnl_af_ops *ops); - void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); -- cgit v1.1 From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 46 ++++++++++++++++++++++++++++++---------------- include/linux/bpf.h | 32 ++++++++++++++++++++++++++++++++ include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 42 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 102 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d41d40a..102e56f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -14,27 +14,42 @@ struct bpf_sock_ops_kern; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_prog_list { + struct list_head node; + struct bpf_prog *prog; +}; + +struct bpf_prog_array; + struct cgroup_bpf { - /* - * Store two sets of bpf_prog pointers, one for programs that are - * pinned directly to this cgroup, and one for those that are effective - * when this cgroup is accessed. + /* array of effective progs in this cgroup */ + struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + + /* attached progs to this cgroup and attach flags + * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will + * have either zero or one element + * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; - struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; - bool disallow_override[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_BPF_ATTACH_TYPE]; + u32 flags[MAX_BPF_ATTACH_TYPE]; + + /* temp storage for effective prog array used by prog_attach/detach */ + struct bpf_prog_array __rcu *inactive; }; void cgroup_bpf_put(struct cgroup *cgrp); -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); +int cgroup_bpf_inherit(struct cgroup *cgrp); -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool overridable); +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); -/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable); +/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} -static inline void cgroup_bpf_inherit(struct cgroup *cgrp, - struct cgroup *parent) {} +static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 252f4bc..a6964b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +/* an array of programs to be executed under rcu_lock. + * + * Typical usage: + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * + * the structure returned by bpf_prog_array_alloc() should be populated + * with program pointers and the last pointer must be NULL. + * The user has to keep refcnt on the program and make sure the program + * is removed from the array before bpf_prog_put(). + * The 'struct bpf_prog_array *' should only be replaced with xchg() + * since other cpus are walking the array of pointers in parallel. + */ +struct bpf_prog_array { + struct rcu_head rcu; + struct bpf_prog *progs[0]; +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); + +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog **_prog; \ + u32 _ret = 1; \ + rcu_read_lock(); \ + _prog = rcu_dereference(array)->progs; \ + for (; *_prog; _prog++) \ + _ret &= func(*_prog, ctx); \ + rcu_read_unlock(); \ + _ret; \ + }) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/filter.h b/include/linux/filter.h index 911d454..2d2db39 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -481,7 +481,7 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define BPF_PROG_RUN(filter, ctx) (*(filter)->bpf_func)(ctx, (filter)->insnsi) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6d2137b..762f74b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,11 +143,47 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command - * to the given target_fd cgroup the descendent cgroup will be able to - * override effective bpf program that was inherited from this cgroup +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel -- cgit v1.1 From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 4 ++++ include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 13 +++++++++++++ 3 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 102e56f..359b6f5 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6964b7..a67daea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -260,6 +260,9 @@ struct bpf_prog_array { struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt); #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762f74b..cb2b9f9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -211,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -289,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: -- cgit v1.1 From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 3 Oct 2017 13:53:23 +0200 Subject: dev: advertise the new nsid when the netns iface changes x-netns interfaces are bound to two netns: the link netns and the upper netns. Usually, this kind of interfaces is created in the link netns and then moved to the upper netns. At the end, the interface is visible only in the upper netns. The link nsid is advertised via netlink in the upper netns, thus the user always knows where is the link part. There is no such mechanism in the link netns. When the interface is moved to another netns, the user cannot "follow" it. This patch adds a new netlink attribute which helps to follow an interface which moves to another netns. When the interface is unregistered, the new nsid is advertised. If the interface is a x-netns interface (ie rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed. CC: Jason A. Donenfeld Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 4 +++- include/uapi/linux/if_link.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index dea59c8..1251638 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, - gfp_t flags); + gfp_t flags, int *new_nsid); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea87bd7..cd580fc 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -158,6 +158,7 @@ enum { IFLA_PAD, IFLA_XDP, IFLA_EVENT, + IFLA_NEW_NETNSID, __IFLA_MAX }; -- cgit v1.1 From 51d0c04795a4b5d9a188336884887a9d394a94b0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:45 -0700 Subject: net: Add extack to netdev_notifier_info Add netlink_ext_ack to netdev_notifier_info to allow notifier handlers to return errors to userspace. Clean up the initialization in dev.c such that extack is easily added in subsequent patches where relevant. Specifically, remove the init call in call_netdevice_notifiers_info and have callers initalize on stack when info is declared. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d04424c..05fcaba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2309,7 +2309,8 @@ int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); struct netdev_notifier_info { - struct net_device *dev; + struct net_device *dev; + struct netlink_ext_ack *extack; }; struct netdev_notifier_change_info { @@ -2334,6 +2335,7 @@ static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; + info->extack = NULL; } static inline struct net_device * @@ -2342,6 +2344,12 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) return info->dev; } +static inline struct netlink_ext_ack * +netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) +{ + return info->extack; +} + int call_netdevice_notifiers(unsigned long val, struct net_device *dev); -- cgit v1.1 From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:46 -0700 Subject: net: Add extack to ndo_add_slave Pass extack to do_set_master and down to ndo_add_slave Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- include/net/bonding.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 05fcaba..368a506 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1246,7 +1246,8 @@ struct net_device_ops { u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, - struct net_device *slave_dev); + struct net_device *slave_dev, + struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); netdev_features_t (*ndo_fix_features)(struct net_device *dev, diff --git a/include/net/bonding.h b/include/net/bonding.h index b2e6865..2860cc6 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -596,7 +596,8 @@ void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); -- cgit v1.1 From 42ab19ee90292993370a30ad242599d75a3b749e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:47 -0700 Subject: net: Add extack to upper device linking Add extack arg to netdev_upper_dev_link and netdev_master_upper_dev_link Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 3 ++- include/linux/netdevice.h | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index c9ec134..10e319f 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -72,7 +72,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, extern void macvlan_common_setup(struct net_device *dev); extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]); + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack); extern void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 368a506..31bb301 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3919,10 +3919,12 @@ void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); -int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); +int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, + struct netlink_ext_ack *extack); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info); + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.1 From 44f209807ee87a5eddf6c0432f3fb63cec27bad8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:50 -0400 Subject: VSOCK: export socket tables for sock_diag interface The socket table symbols need to be exported from vsock.ko so that the vsock_diag.ko module will be able to traverse sockets. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f9fb566..30cba80 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -27,6 +27,11 @@ #define LAST_RESERVED_PORT 1023 +#define VSOCK_HASH_SIZE 251 +extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +extern spinlock_t vsock_table_lock; + #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) -- cgit v1.1 From bf359b8127719535f88494adb3c2b73c06667dcd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:51 -0400 Subject: VSOCK: move __vsock_in_bound/connected_table() to af_vsock.h The vsock_diag.ko module will need to check socket table membership. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 30cba80..3dd2177 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -180,6 +180,18 @@ const struct vsock_transport *vsock_core_get_transport(void); /**** UTILS ****/ +/* vsock_table_lock must be held */ +static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->bound_table); +} + +/* vsock_table_lock must be held */ +static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->connected_table); +} + void vsock_release_pending(struct sock *pending); void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); -- cgit v1.1 From 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:52 -0400 Subject: VSOCK: use TCP state constants for sk_state There are two state fields: socket->state and sock->sk_state. The socket->state field uses SS_UNCONNECTED, SS_CONNECTED, etc while the sock->sk_state typically uses values that match TCP state constants (TCP_CLOSE, TCP_ESTABLISHED). AF_VSOCK does not follow this convention and instead uses SS_* constants for both fields. The sk_state field will be exposed to userspace through the vsock_diag interface for ss(8), netstat(8), and other programs. This patch switches sk_state to TCP state constants so that the meaning of this field is consistent with other address families. Not just AF_INET and AF_INET6 use the TCP constants, AF_UNIX and others do too. The following mapping was used to convert the code: SS_FREE -> TCP_CLOSE SS_UNCONNECTED -> TCP_CLOSE SS_CONNECTING -> TCP_SYN_SENT SS_CONNECTED -> TCP_ESTABLISHED SS_DISCONNECTING -> TCP_CLOSING VSOCK_SS_LISTEN -> TCP_LISTEN In __vsock_create() the sk_state initialization was dropped because sock_init_data() already initializes sk_state to TCP_CLOSE. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3dd2177..9324ac2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -22,9 +22,6 @@ #include "vsock_addr.h" -/* vsock-specific sock->sk_state constants */ -#define VSOCK_SS_LISTEN 255 - #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 -- cgit v1.1 From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:53 -0400 Subject: VSOCK: add sock_diag interface This patch adds the sock_diag interface for querying sockets from userspace. Tools like ss(8) and netstat(8) can use this interface to list open sockets. The userspace ABI is defined in and includes netlink request and response structs. The request can query sockets based on their sk_state (e.g. listening sockets only) and the response contains socket information fields including the local/remote addresses, inode number, etc. This patch does not dump VMCI pending sockets because I have only tested the virtio transport, which does not use pending sockets. Support can be added later by extending vsock_diag_dump() if needed by VMCI users. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/uapi/linux/vm_sockets_diag.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/uapi/linux/vm_sockets_diag.h (limited to 'include') diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h new file mode 100644 index 0000000..14cd7dc --- /dev/null +++ b/include/uapi/linux/vm_sockets_diag.h @@ -0,0 +1,33 @@ +/* AF_VSOCK sock_diag(7) interface for querying open sockets */ + +#ifndef _UAPI__VM_SOCKETS_DIAG_H__ +#define _UAPI__VM_SOCKETS_DIAG_H__ + +#include + +/* Request */ +struct vsock_diag_req { + __u8 sdiag_family; /* must be AF_VSOCK */ + __u8 sdiag_protocol; /* must be 0 */ + __u16 pad; /* must be 0 */ + __u32 vdiag_states; /* query bitmap (e.g. 1 << TCP_LISTEN) */ + __u32 vdiag_ino; /* must be 0 (reserved) */ + __u32 vdiag_show; /* must be 0 (reserved) */ + __u32 vdiag_cookie[2]; +}; + +/* Response */ +struct vsock_diag_msg { + __u8 vdiag_family; /* AF_VSOCK */ + __u8 vdiag_type; /* SOCK_STREAM or SOCK_DGRAM */ + __u8 vdiag_state; /* sk_state (e.g. TCP_LISTEN) */ + __u8 vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */ + __u32 vdiag_src_cid; + __u32 vdiag_src_port; + __u32 vdiag_dst_cid; + __u32 vdiag_dst_port; + __u32 vdiag_ino; + __u32 vdiag_cookie[2]; +}; + +#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */ -- cgit v1.1 From 27204aaa9dc67b833b77179fdac556288ec3a4bf Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:03:44 -0700 Subject: tcp: uniform the set up of sockets after successful connection Currently in the TCP code, the initialization sequence for cached metrics, congestion control, BPF, etc, after successful connection is very inconsistent. This introduces inconsistent bevhavior and is prone to bugs. The current call sequence is as follows: (1) for active case (tcp_finish_connect() case): tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); (2) for passive case (tcp_rcv_state_process() TCP_SYN_RECV case): icsk->icsk_af_ops->rebuild_header(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); tcp_init_buffer_space(sk); tcp_init_metrics(sk); (3) for TFO passive case (tcp_fastopen_create_child()): inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); This commit uniforms the above functions to have the following sequence: tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE/PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); This sequence is the same as the (1) active case. We pick this sequence because this order correctly allows BPF to override the settings including congestion control module and initial cwnd, etc from the route, and then allows the CC module to see those settings. Suggested-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7a3a8af..426c2e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -416,6 +416,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); +void tcp_init_transfer(struct sock *sk, int bpf_op); unsigned int tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, -- cgit v1.1 From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 12:59:58 -0700 Subject: tcp: new list for sent but unacked skbs for RACK recovery This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb). This list will be used to optimize TCP Rack recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and iterates through SACKed skbs repeatedly. Special cares for rare events: 1. TCP repair fakes skb transmission so the send queue needs adjusted 2. SACK reneging would require re-inserting SACKed skbs into the send queue. For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here. 3. Fast Open: currently for non-TFO, send-queue correctly queues the pure SYN packet. For TFO which queues a pure SYN and then a data packet, send-queue only queues the data packet but not the pure SYN due to the structure of TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK). In order to not grow sk_buff, we use an union for the new list and _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before skb is cloned/copied at transmit, and before being freed. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- include/linux/tcp.h | 1 + include/net/tcp.h | 24 +++++++++++++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ada8214..01a9859 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t; * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function + * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on @@ -686,8 +687,14 @@ struct sk_buff { */ char cb[48] __aligned(8); - unsigned long _skb_refdst; - void (*destructor)(struct sk_buff *skb); + union { + struct { + unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); + }; + struct list_head tcp_tsorted_anchor; + }; + #ifdef CONFIG_XFRM struct sec_path *sp; #endif diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4aa40ef..1d2c44e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -191,6 +191,7 @@ struct tcp_sock { u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 426c2e9..3b16f35 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1589,14 +1589,34 @@ enum tcp_chrono { void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); +/* This helper is needed, because skb->tcp_tsorted_anchor uses + * the same memory storage than skb->destructor/_skb_refdst + */ +static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) +{ + skb->destructor = NULL; + skb->_skb_refdst = 0UL; +} + +#define tcp_skb_tsorted_save(skb) { \ + unsigned long _save = skb->_skb_refdst; \ + skb->_skb_refdst = 0UL; + +#define tcp_skb_tsorted_restore(skb) \ + skb->_skb_refdst = _save; \ +} + /* write queue abstraction */ static inline void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); + } + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); } @@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + list_del(&skb->tcp_tsorted_anchor); + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -- cgit v1.1 From 4e64b1ed15e25b8dcc2819c6d43dab72eb0bea26 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Oct 2017 23:46:14 -0700 Subject: net/ipv6: Convert icmpv6_push_pending_frames to void commit cc71b7b07119 ("net/ipv6: remove unused err variable on icmpv6_push_pending_frames") exposed icmpv6_push_pending_frames return value not being used. Remove now unnecessary int err declarations and uses. Miscellanea: o Remove unnecessary goto and out: labels o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6eac5cf..3cda3b5 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len); +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); -- cgit v1.1 From 18a4c0eab2623cc95be98a1e6af1ad18e7695977 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:21 -0700 Subject: net: add rb_to_skb() and other rb tree helpers Geeralize private netem_rb_to_skb() TCP rtx queue will soon be converted to rb-tree, so we will need skb_rbtree_walk() helpers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 01a9859..03634ec2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) return __skb_grow(skb, len); } +#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) +#define skb_rb_first(root) rb_to_skb(rb_first(root)) +#define skb_rb_last(root) rb_to_skb(rb_last(root)) +#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) +#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) +#define skb_rbtree_walk(skb, root) \ + for (skb = skb_rb_first(root); skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from(skb) \ + for (; skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from_safe(skb, tmp) \ + for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ + skb = tmp) + #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ -- cgit v1.1 From ac3f09ba3e496bd7cc780ead05b1d1bb5f33aedb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:22 -0700 Subject: tcp: uninline tcp_write_queue_purge() Since the upcoming rtx rbtree will add some extra code, it is time to not inline this fat function anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b16f35..744559b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1606,20 +1606,7 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) skb->_skb_refdst = _save; \ } -/* write queue abstraction */ -static inline void tcp_write_queue_purge(struct sock *sk) -{ - struct sk_buff *skb; - - tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - tcp_skb_tsorted_anchor_cleanup(skb); - sk_wmem_free_skb(sk, skb); - } - INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); - tcp_clear_all_retrans_hints(tcp_sk(sk)); -} +void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { -- cgit v1.1 From 75c119afe14f74b4dd967d75ed9f57ab6c0ef045 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:27 -0700 Subject: tcp: implement rb-tree based retransmit queue Using a linear list to store all skbs in write queue has been okay for quite a while : O(N) is not too bad when N < 500. Things get messy when N is the order of 100,000 : Modern TCP stacks want 10Gbit+ of throughput even with 200 ms RTT flows. 40 ns per cache line miss means a full scan can use 4 ms, blowing away CPU caches. SACK processing often can use various hints to avoid parsing whole retransmit queue. But with high packet losses and/or high reordering, hints no longer work. Sender has to process thousands of unfriendly SACK, accumulating a huge socket backlog, burning a cpu and massively dropping packets. Using an rb-tree for retransmit queue has been avoided for years because it added complexity and overhead, but now is the time to be more resistant and say no to quadratic behavior. 1) RTX queue is no longer part of the write queue : already sent skbs are stored in one rb-tree. 2) Since reaching the head of write queue no longer needs sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue Tested: On receiver : netem on ingress : delay 150ms 200us loss 1 GRO disabled to force stress and SACK storms. for f in `seq 1 10` do ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1 done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}' Before patch : 323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88 2840 After patch: 1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94 18053 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 7 +++-- include/net/tcp.h | 89 ++++++++++++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d..4827094 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -60,7 +60,7 @@ #include #include #include - +#include #include #include #include @@ -397,7 +397,10 @@ struct sock { int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; - struct sk_buff *sk_send_head; + union { + struct sk_buff *sk_send_head; + struct rb_root tcp_rtx_queue; + }; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; diff --git a/include/net/tcp.h b/include/net/tcp.h index 744559b..5a95e58 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); -int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); +enum tcp_queue { + TCP_FRAG_IN_WRITE_QUEUE, + TCP_FRAG_IN_RTX_QUEUE, +}; +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, + unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); @@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) void tcp_write_queue_purge(struct sock *sk); +static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) +{ + return skb_rb_first(&sk->tcp_rtx_queue); +} + static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); @@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, return skb_queue_prev(&sk->sk_write_queue, skb); } -#define tcp_for_write_queue(skb, sk) \ - skb_queue_walk(&(sk)->sk_write_queue, skb) - -#define tcp_for_write_queue_from(skb, sk) \ - skb_queue_walk_from(&(sk)->sk_write_queue, skb) - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { - return sk->sk_send_head; + return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, @@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } -static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) +static inline bool tcp_write_queue_empty(const struct sock *sk) { - if (tcp_skb_is_last(sk, skb)) - sk->sk_send_head = NULL; - else - sk->sk_send_head = tcp_write_queue_next(sk, skb); + return skb_queue_empty(&sk->sk_write_queue); +} + +static inline bool tcp_rtx_queue_empty(const struct sock *sk) +{ + return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); +} + +static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) +{ + return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) { - if (sk->sk_send_head == skb_unlinked) { - sk->sk_send_head = NULL; + if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - } + if (tcp_sk(sk)->highest_sack == skb_unlinked) tcp_sk(sk)->highest_sack = NULL; } -static inline void tcp_init_send_head(struct sock *sk) -{ - sk->sk_send_head = NULL; -} - static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); @@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_send_head == NULL) { - sk->sk_send_head = skb; + if (sk->sk_write_queue.next == skb) { tcp_chrono_start(sk, TCP_CHRONO_BUSY); if (tcp_sk(sk)->highest_sack == NULL) @@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s __skb_queue_head(&sk->sk_write_queue, skb); } -/* Insert buff after skb on the write queue of sk. */ -static inline void tcp_insert_write_queue_after(struct sk_buff *skb, - struct sk_buff *buff, - struct sock *sk) -{ - __skb_queue_after(&sk->sk_write_queue, skb, buff); -} - /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); - - if (sk->sk_send_head == skb) - sk->sk_send_head = new; } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { - list_del(&skb->tcp_tsorted_anchor); - tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -static inline bool tcp_write_queue_empty(struct sock *sk) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); + +static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + tcp_skb_tsorted_anchor_cleanup(skb); + rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); +} + +static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) +{ + list_del(&skb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) @@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL : - tcp_write_queue_next(sk, skb); + struct sk_buff *next = skb_rb_next(skb); + + tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk); + struct sk_buff *skb = tcp_rtx_queue_head(sk); + + tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); } /* Called when old skb is about to be deleted (to be combined with new skb) */ @@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { - const struct sk_buff *skb = tcp_write_queue_head(sk); + const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); -- cgit v1.1 From 180ca444b985c42948fa26abd278e616b5ce7eb2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:56 -0700 Subject: ipv6: introduce a new function fib6_update_sernum() This function takes a route as input and tries to update the sernum in the fib6_node this route is associated with. It will be used in later commit when adding a cached route into the exception table under that route. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d060d71..152b7b1 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -358,6 +358,8 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); +void fib6_update_sernum(struct rt6_info *rt); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); -- cgit v1.1 From 35732d01fe311ec13c4e42936878b782b8e7ea85 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:57 -0700 Subject: ipv6: introduce a hash table to store dst cache Add a hash table into struct rt6_info in order to store dst caches created by pmtu discovery and ip redirect in ipv6 routing code. APIs to add dst cache, delete dst cache, find dst cache and update dst cache in the hash table are implemented and will be used in later commits. This is a preparation work to move all cache routes into the exception table instead of getting inserted into the fib6 tree. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 19 +++++++++++++++++++ include/net/ip6_route.h | 3 +++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 152b7b1..c4864c1 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -98,6 +98,22 @@ struct rt6key { struct fib6_table; +struct rt6_exception_bucket { + struct hlist_head chain; + int depth; +}; + +struct rt6_exception { + struct hlist_node hlist; + struct rt6_info *rt6i; + unsigned long stamp; + struct rcu_head rcu; +}; + +#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 +#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) +#define FIB6_MAX_DEPTH 5 + struct rt6_info { struct dst_entry dst; @@ -134,12 +150,15 @@ struct rt6_info { struct inet6_dev *rt6i_idev; struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 exception_bucket_flushed:1, + unused:7; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ee96f40..3315605 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -95,6 +95,9 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); +void rt6_flush_exceptions(struct rt6_info *rt); +int rt6_remove_exception_rt(struct rt6_info *rt); + static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, unsigned int prefs, -- cgit v1.1 From c757faa8bfa26a0dd24b41ff783e0da042156887 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:01 -0700 Subject: ipv6: prepare fib6_age() for exception table If all dst cache entries are stored in the exception table under the main route, we have to go through them during fib6_age() when doing garbage collecting. Introduce a new function rt6_age_exception() which goes through all dst entries in the exception table and remove those entries that are expired. This function is called in fib6_age() so that all dst caches are also garbage collected. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 +++++++++++++ include/net/ip6_route.h | 2 ++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c4864c1..11a79ef 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -29,6 +29,14 @@ #define FIB6_TABLE_HASHSZ 1 #endif +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) pr_debug(x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif + struct rt6_info; struct fib6_config { @@ -75,6 +83,11 @@ struct fib6_node { struct rcu_head rcu; }; +struct fib6_gc_args { + int timeout; + int more; +}; + #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3315605..a0087fb 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -97,6 +97,8 @@ int ip6_del_rt(struct rt6_info *); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); +void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, -- cgit v1.1 From 38fbeeeeccdb38d0635398e8e344d245f6d8dc52 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:02 -0700 Subject: ipv6: prepare fib6_locate() for exception table fib6_locate() is used to find the fib6_node according to the passed in prefix address key. It currently tries to find the fib6_node with the exact match of the passed in key. However, when we move cached routes into the exception table, fib6_locate() will fail to find the fib6_node for it as the cached routes will be stored in the exception table under the fib6_node with the longest prefix match of the cache's dst addr key. This commit adds a new parameter to let the caller specify if it needs exact match or longest prefix match. Right now, all callers still does exact match when calling fib6_locate(). It will be changed in later commit where exception table is hooked up to store cached routes. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 11a79ef..4497a1e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -357,7 +357,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len); + const struct in6_addr *saddr, int src_len, + bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); -- cgit v1.1 From 2b760fcf5cfb34e8610df56d83745b2b74ae1379 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:03 -0700 Subject: ipv6: hook up exception table to store dst cache This commit makes use of the exception hash table implementation to store dst caches created by pmtu discovery and ip redirect into the hash table under the rt_info and no longer inserts these routes into fib6 tree. This makes the fib6 tree only contain static configured routes and could now be protected by rcu instead of a rw lock. With this change, in the route lookup related functions, after finding the rt6_info with the longest prefix, we also need to search for the exception table before doing backtracking. In the route delete function, if the route being deleted is not a dst cache, deletion of this route also need to flush the whole hash table under it. If it is a dst cache, then only delete the cached dst in the hash table. Note: for fib6_walk_continue() function, w->root now is always pointing to a root node considering that fib6_prune_clones() is removed from the code. So we add a WARN_ON() msg to make sure w->root always points to a root node and also removed the update of w->root in fib6_repair_tree(). This is a prerequisite for later patch because we don't need to make w->root as rcu protected when replacing rwlock with RCU. Also, we remove all prune related variables as it is no longer used. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4497a1e..d0b7283 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -280,7 +280,6 @@ struct fib6_walker { struct fib6_node *root, *node; struct rt6_info *leaf; enum fib6_walk_state state; - bool prune; unsigned int skip; unsigned int count; int (*func)(struct fib6_walker *); -- cgit v1.1 From bbd63f06d114a52be33f6982fc89ca2768cdeb62 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:07 -0700 Subject: ipv6: update fn_sernum after route is inserted to tree fib6_add() logic currently calls fib6_add_1() to figure out what node should be used for the newly added route and then call fib6_add_rt2node() to insert the route to the node. And during the call of fib6_add_1(), fn_sernum is updated for all nodes that share the same prefix as the new route. This does not have issue in the current code because reader thread will not be able to access the tree while writer thread is inserting new route to it. However, it is not the case once we transition to use RCU. Reader thread could potentially see the new fn_sernum before the new route is inserted. As a result, reader thread's route lookup will return a stale route with the new fn_sernum. In order to solve this issue, we remove all the update of fn_sernum in fib6_add_1(), and instead, introduce a new function that updates fn_sernum for all related nodes and call this functions once the route is successfully inserted to the tree. Also, smp_wmb() is used after a route is successfully inserted into the fib tree and right before the updated of fn->sernum. And smp_rmb() is used right after fn->sernum is accessed in rt6_get_cookie_safe(). This is to guarantee that when the reader thread sees the new fn->sernum, the new route is already inserted in the tree in memory. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d0b7283..6bf929b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -220,6 +220,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, if (fn) { *cookie = fn->fn_sernum; + /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + smp_rmb(); status = true; } -- cgit v1.1 From 66f5d6ce53e665477d2a33e8f539d4fa4ca81c83 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:10 -0700 Subject: ipv6: replace rwlock with rcu and spinlock in fib6_table With all the preparation work before, we are now ready to replace rwlock with rcu and spinlock in fib6_table. That means now all fib6_node in fib6_table are protected by rcu. And when freeing fib6_node, call_rcu() is used to wait for the rcu grace period before releasing the memory. When accessing fib6_node, corresponding rcu APIs need to be used. And all previous sessions protected by the write lock will now be protected by the spin lock per table. All previous sessions protected by read lock will now be protected by rcu_read_lock(). A couple of things to note here: 1. As part of the work of replacing rwlock with rcu, the linked list of fn->leaf now has to be rcu protected as well. So both fn->leaf and rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are used when manipulating them. 2. For fn->rr_ptr, first of all, it also needs to be rcu protected now and is tagged with __rcu and rcu APIs are used in corresponding places. Secondly, fn->rr_ptr is changed in rt6_select() which is a reader thread. This makes the issue a bit complicated. We think a valid solution for it is to let rt6_select() grab the tb6_lock if it decides to change it. As it is not in the normal operation and only happens when there is no valid neighbor cache for the route, we think the performance impact should be low. 3. fib6_walk_continue() has to be called with tb6_lock held even in the route dumping related functions, e.g. inet6_dump_fib(), fib6_tables_dump() and ipv6_route_seq_ops. It is because fib6_walk_continue() makes modifications to the walker structure, and so are fib6_repair_tree() and fib6_del_route(). In order to do proper syncing between them, we need to let fib6_walk_continue() hold the lock. We may be able to do further improvement on the way we do the tree walk to get rid of the need for holding the spin lock. But not for now. 4. When fib6_del_route() removes a route from the tree, we no longer mark rt->dst.rt6_next to NULL to make simultaneous reader be able to further traverse the list with rcu. However, rt->dst.rt6_next is only valid within this same rcu period. No one should access it later. 5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be performed before we publish this route (either by linking it to fn->leaf or insert it in the list pointed by fn->leaf) just to be safe because as soon as we publish the route, some read thread will be able to access it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- include/net/ip6_fib.h | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 06a6765..204c19e 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,7 @@ struct dst_entry { union { struct dst_entry *next; struct rtable __rcu *rt_next; - struct rt6_info *rt6_next; + struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6bf929b..0b438b9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -68,18 +68,18 @@ struct fib6_config { }; struct fib6_node { - struct fib6_node *parent; - struct fib6_node *left; - struct fib6_node *right; + struct fib6_node __rcu *parent; + struct fib6_node __rcu *left; + struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES - struct fib6_node *subtree; + struct fib6_node __rcu *subtree; #endif - struct rt6_info *leaf; + struct rt6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info *rr_ptr; + struct rt6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -91,7 +91,7 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else -#define FIB6_SUBTREE(fn) ((fn)->subtree) +#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif struct mx6_config { @@ -174,6 +174,14 @@ struct rt6_info { unused:7; }; +#define for_each_fib6_node_rt_rcu(fn) \ + for (rt = rcu_dereference((fn)->leaf); rt; \ + rt = rcu_dereference(rt->dst.rt6_next)) + +#define for_each_fib6_walker_rt(w) \ + for (rt = (w)->leaf; rt; \ + rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; @@ -310,7 +318,7 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; -- cgit v1.1 From 81eb8447daae3b62247aa66bb17b82f8fef68249 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:11 -0700 Subject: ipv6: take care of rt6_stats Currently, most of the rt6_stats are not hooked up correctly. As the last part of this patch series, hook up all existing rt6_stats and add one new stat fib_rt_uncache to indicate the number of routes in the uncached list. For details of the stats, please refer to the comments added in include/net/ip6_fib.h. Note: fib_rt_alloc and fib_rt_uncache are not guaranteed to be modified under a lock. So atomic_t is used for them. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0b438b9..10c9138 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -297,12 +297,15 @@ struct fib6_walker { }; struct rt6_statistics { - __u32 fib_nodes; - __u32 fib_route_nodes; - __u32 fib_rt_alloc; /* permanent routes */ - __u32 fib_rt_entries; /* rt entries in table */ - __u32 fib_rt_cache; /* cache routes */ - __u32 fib_discarded_routes; + __u32 fib_nodes; /* all fib6 nodes */ + __u32 fib_route_nodes; /* intermediate nodes */ + __u32 fib_rt_entries; /* rt entries in fib table */ + __u32 fib_rt_cache; /* cached rt entries in exception table */ + __u32 fib_discarded_routes; /* total number of routes delete */ + + /* The following stats are not protected by any lock */ + atomic_t fib_rt_alloc; /* total number of routes alloced */ + atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 -- cgit v1.1 From bdc476413dcdb5c38a7dec90fb2bca327021273a Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Wed, 4 Oct 2017 19:35:57 +0200 Subject: ip_tunnel: add mpls over gre support This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel API by simply adding ipgre_tunnel_encap_(add|del)_mpls_ops() and the new tunnel type TUNNEL_ENCAP_MPLS. Signed-off-by: Amine Kherbouche Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 2e52088..a2f48c0 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -84,6 +84,7 @@ enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, TUNNEL_ENCAP_GUE, + TUNNEL_ENCAP_MPLS, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) -- cgit v1.1 From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionalities. It contains the changes in perf event area needed for subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- include/linux/perf_event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24..79b18a2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -806,6 +806,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } -- cgit v1.1 From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:20 -0700 Subject: bpf: add helper bpf_perf_event_read_value for perf event array map Hardware pmu counters are limited resources. When there are more pmu based perf events opened than available counters, kernel will multiplex these events so each event gets certain percentage (but not 100%) of the pmu time. In case that multiplexing happens, the number of samples or counter value will not reflect the case compared to no multiplexing. This makes comparison between different runs difficult. Typically, the number of samples or counter value should be normalized before comparing to other experiments. The typical normalization is done like: normalized_num_samples = num_samples * time_enabled / time_running normalized_counter_value = counter_value * time_enabled / time_running where time_enabled is the time enabled for event and time_running is the time running for event since last normalization. This patch adds helper bpf_perf_event_read_value for kprobed based perf event array map, to read perf counter and enabled/running time. The enabled/running time is accumulated since the perf event open. To achieve scaling factor between two bpf invocations, users can can use cpu_id as the key (which is typical for perf array usage model) to remember the previous value and do the calculation inside the bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6082faf..7b57a21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -641,6 +641,14 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data_meta * Return: 0 on success or negative on error + * + * int bpf_perf_event_read_value(map, flags, buf, buf_size) + * read perf event counter value and perf event enabled/running time + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -697,7 +705,8 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ - FN(xdp_adjust_meta), + FN(xdp_adjust_meta), \ + FN(perf_event_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -741,7 +750,9 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK /* BPF_FUNC_perf_event_output for sk_buff input context. */ @@ -934,4 +945,10 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.1 From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:22 -0700 Subject: bpf: add helper bpf_perf_prog_read_value This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf programs, to read event counter and enabled/running time. The enabled/running time is accumulated since the perf event open. The typical use case for perf event based bpf program is to attach itself to a single event. In such cases, if it is desirable to get scaling factor between two bpf invocations, users can can save the time values in a map, and use the value from the map and the current value to calculate the scaling factor. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b57a21..5bbbec1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -649,6 +649,13 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return: 0 on success or negative error code + * + * int bpf_perf_prog_read_value(ctx, buf, buf_size) + * read perf prog attached perf event counter and enabled/running time + * @ctx: pointer to ctx + * @buf: buf to fill + * @buf_size: size of the buf + * Return : 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -706,7 +713,8 @@ union bpf_attr { FN(sk_redirect_map), \ FN(sock_map_update), \ FN(xdp_adjust_meta), \ - FN(perf_event_read_value), + FN(perf_event_read_value), \ + FN(perf_prog_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.1 From 64237470ddf97b63155fbd272c9e743e01d5f514 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:37:29 +0800 Subject: net: phonet: mark header_ops as const Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- include/linux/if_phonet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_phonet.h b/include/linux/if_phonet.h index bbcdb0a..a118ee4 100644 --- a/include/linux/if_phonet.h +++ b/include/linux/if_phonet.h @@ -10,5 +10,5 @@ #include -extern struct header_ops phonet_header_ops; +extern const struct header_ops phonet_header_ops; #endif -- cgit v1.1 From 548ec114705bb8f0879a0da12abec17f17a7cc26 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:40:35 +0800 Subject: net: phonet: mark phonet_protocol as const The phonet_protocol structs don't need to be written by anyone and so can be marked as const. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- include/net/phonet/phonet.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 039cc29..51e1a2a 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -108,8 +108,10 @@ struct phonet_protocol { int sock_type; }; -int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp); +int phonet_proto_register(unsigned int protocol, + const struct phonet_protocol *pp); +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); -- cgit v1.1 From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:12 -0700 Subject: bpf: Use char in prog and map name Instead of u8, use char for prog and map name. It can avoid the userspace tool getting compiler's signess warning. The bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and bpf_map_info are changed. Signed-off-by: Martin KaFai Lau Cc: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a67daea..bc7da2d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ @@ -189,7 +189,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5bbbec1..6db9e1d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -888,7 +888,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -898,7 +898,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.1 From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:37 -0700 Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to suppress arp and nd flood on bridge ports. It implements rfc7432, section 10. https://tools.ietf.org/html/rfc7432#section-10 for ethernet VPN deployments. It is similar to the existing BR_PROXYARP* flags but has a few semantic differences to conform to EVPN standard. Unlike the existing flags, this new flag suppresses flood of all neigh discovery packets (arp and nd) to tunnel ports. Supports both vlan filtering and non-vlan filtering bridges. In case of EVPN, it is mainly used to avoid flooding of arp and nd packets to tunnel ports like vxlan. This patch adds netlink and sysfs support to set this bridge port flag. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3cd18ac..316ee11 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -49,6 +49,7 @@ struct br_ip_list { #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) #define BR_BCAST_FLOOD BIT(14) +#define BR_NEIGH_SUPPRESS BIT(15) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cd580fc..b037e0a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -327,6 +327,7 @@ enum { IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.1 From 77041420751fe6d4acf2103b245dcc2b4b7b8360 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:31 +0200 Subject: net: bridge: Notify on bridge device mrouter state changes Add the SWITCHDEV_ATTR_ID_BRIDGE_MROUTER switchdev notification type, used to indicate whether the bridge is or isn't mrouter. Notify when the bridge changes its state, similarly to the already existing bridged port mrouter notifications. The notification uses the switchdev_attr.u.mrouter boolean flag to indicate the current bridge mrouter status. Thus, it only indicates whether the bridge is currently used as an mrouter or not, and does not indicate the exact mrouter state of the bridge (learning, permanent, etc.). Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d767b79..d756fbe 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -51,6 +51,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, + SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, }; struct switchdev_attr { -- cgit v1.1 From 0912bda436388a02c72164b4b490b578e64c012e Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:32 +0200 Subject: net: bridge: Export bridge multicast router state Add an access function that, given a bridge netdevice, returns whether the bridge device is currently an mrouter or not. The function uses the already existing br_multicast_is_router function to check that. This function is needed in order to allow ports that join an already existing bridge to know the current mrouter state of the bridge device. Together with the bridge device mrouter ports switchdev notifications, it is possible to have full offloading of the semantics of the bridge device mcast router state. Due to the fact that the bridge multicast router status can change in packet RX path, take the multicast_router bridge spinlock to protect the read. Signed-off-by: Yotam Gigi Reviewed-by: Nogah Frankel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 316ee11..02639eb 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -64,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev, bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); +bool br_multicast_router(const struct net_device *dev); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -84,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev) { return false; } +static inline bool br_multicast_router(const struct net_device *dev) +{ + return false; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) -- cgit v1.1 From ed468ebee04ffba0231a8f50616bdb250752a891 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:44 +0300 Subject: qed: Add ll2 ability of opening a secondary queue When more than one ll2 queue is opened ( that is not an OOO queue ) ll2 code does not have enough information to determine whether the queue is the main one or not, so a new field is added to the acquire input data to expose the control of determining whether the queue is the main queue or a secondary queue. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_ll2_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 89fa0bb..d7cca59 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -171,6 +171,7 @@ struct qed_ll2_acquire_data_inputs { enum qed_ll2_tx_dest tx_dest; enum qed_ll2_error_handle ai_err_packet_too_big; enum qed_ll2_error_handle ai_err_no_buf; + bool secondary_queue; u8 gsi_enable; }; -- cgit v1.1 From 77caa792f5d8e4ecc88eb1cf4b9c478c07e0ec57 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:45 +0300 Subject: qed: Add ll2 option for dropping a tx packet The option of sending a packet on the ll2 and dropping it exists in hardware and was not used until now, thus not exposed. The iWARP unaligned MPA flow requires this functionality for flushing the tx queue. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_ll2_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index d7cca59..95fdf02 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type { enum qed_ll2_tx_dest { QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */ QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */ + QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */ QED_LL2_TX_DEST_MAX }; -- cgit v1.1 From 6f34a284f36399501fcc034dc4522a2d8d9fa6c9 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 9 Oct 2017 12:37:48 +0300 Subject: qed: Add LL2 slowpath handling For iWARP unaligned MPA flow, a slowpath event of flushing an MPA connection that entered an unaligned state is required. The flush ramrod is received on the ll2 queue, and a pre-registered callback function is called to handle the flush event. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_ll2_if.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 95fdf02..e755954 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -151,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt, dma_addr_t first_frag_addr, bool b_last_fragment, bool b_last_packet); +typedef +void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1); + struct qed_ll2_cbs { qed_ll2_complete_rx_packet_cb rx_comp_cb; qed_ll2_release_rx_packet_cb rx_release_cb; qed_ll2_complete_tx_packet_cb tx_comp_cb; qed_ll2_release_tx_packet_cb tx_release_cb; + qed_ll2_slowpath_cb slowpath_cb; void *cookie; }; -- cgit v1.1 From cf4c950b87ee2f547ad3abd3aca6ae3f3eb3443f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 14:30:52 -0700 Subject: once: switch to new jump label API Switch the DO_ONCE() macro from the deprecated jump label API to the new one. The new one is more readable, and for DO_ONCE() it also makes the generated code more icache-friendly: now the one-time initialization code is placed out-of-line at the jump target, rather than at the inline fallthrough case. Acked-by: Hannes Frederic Sowa Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- include/linux/once.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/once.h b/include/linux/once.h index 9c98aaa..7247249 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,7 +5,7 @@ #include bool __do_once_start(bool *done, unsigned long *flags); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags); /* Call a function exactly once. The idea of DO_ONCE() is to perform @@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key, ({ \ bool ___ret = false; \ static bool ___done = false; \ - static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \ - if (static_key_true(&___once_key)) { \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ ___ret = __do_once_start(&___done, &___flags); \ if (unlikely(___ret)) { \ -- cgit v1.1 From ceaa001a170e43608854d5290a48064f57b565ed Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 4 Oct 2017 17:03:12 -0700 Subject: openvswitch: Add erspan tunnel support. Add erspan netlink interface for OVS. Signed-off-by: William Tu Cc: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 156ee4c..efdbfbf 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* be32 ERSPAN index. */ __OVS_TUNNEL_KEY_ATTR_MAX }; -- cgit v1.1 From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:10 -0700 Subject: bpf: encapsulate verifier log state into a structure Put the loose log_* variables into a structure. This will make it simpler to remove the global verifier state in following patches. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b8d200f..163541b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,6 +115,19 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifer_log { + u32 level; + char *kbuf; + char __user *ubuf; + u32 len_used; + u32 len_total; +}; + +static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +{ + return log->len_used >= log->len_total - 1; +} + struct bpf_verifier_env; struct bpf_ext_analyzer_ops { int (*insn_hook)(struct bpf_verifier_env *env, -- cgit v1.1 From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:11 -0700 Subject: bpf: move global verifier log into verifier environment The biggest piece of global state protected by the verifier lock is the verifier_log. Move that log to struct bpf_verifier_env. struct bpf_verifier_env has to be passed now to all invocations of verbose(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 163541b..5ddb9a6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -152,6 +152,8 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + + struct bpf_verifer_log log; }; int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, -- cgit v1.1 From d66f2b91f95b56e31772b9faa0d036cd2e53cb02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:14 -0700 Subject: bpf: don't rely on the verifier lock for metadata_dst allocation bpf_skb_set_tunnel_*() functions require allocation of per-cpu metadata_dst. The allocation happens upon verification of the first program using those helpers. In preparation for removing the verifier lock, use cmpxchg() to make sure we only allocate the metadata_dsts once. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 9fba2eb..87a0bb8 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -87,6 +87,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); -- cgit v1.1 From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:15 -0700 Subject: bpf: write back the verifier log buffer as it gets filled Verifier log buffer can be quite large (up to 16MB currently). As Eric Dumazet points out if we allow multiple verification requests to proceed simultaneously, malicious user may use the verifier as a way of allocating large amounts of unswappable memory to OOM the host. Switch to a strategy of allocating a smaller buffer (1024B) and writing it out into the user buffer after every print. While at it remove the old BUG_ON(). This is in preparation of the global verifier lock removal. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5ddb9a6..f00ef75 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,9 +115,11 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define BPF_VERIFIER_TMP_LOG_SIZE 1024 + struct bpf_verifer_log { u32 level; - char *kbuf; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; u32 len_used; u32 len_total; -- cgit v1.1 From b8226962b1c49c784aeddb9d2fafbf53dfdc2190 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 10 Oct 2017 16:54:44 -0400 Subject: openvswitch: add ct_clear action This adds a ct_clear action for clearing conntrack state. ct_clear is currently implemented in OVS userspace, but is not backed by an action in the kernel datapath. This is useful for flows that may modify a packet tuple after a ct lookup has already occurred. Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index efdbfbf..0cd6f88 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -807,6 +807,7 @@ struct ovs_action_push_eth { * packet. * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -836,6 +837,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */ OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ + OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit v1.1 From 8c418b5b15747eda05d086e80fa0a767982fbf37 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Oct 2017 11:53:32 +0200 Subject: fq: support filtering a given tin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add to the FQ API a way to filter a given tin, in order to remove frames that fulfil certain criteria according to a filter function. This will be used by mac80211 to remove frames belonging to an AP VLAN interface that's being removed. Signed-off-by: Johannes Berg Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq.h | 7 +++++ include/net/fq_impl.h | 72 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/fq.h b/include/net/fq.h index 6d8521a..ac944a6 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -90,6 +90,13 @@ typedef void fq_skb_free_t(struct fq *, struct fq_flow *, struct sk_buff *); +/* Return %true to filter (drop) the frame. */ +typedef bool fq_skb_filter_t(struct fq *, + struct fq_tin *, + struct fq_flow *, + struct sk_buff *, + void *); + typedef struct fq_flow *fq_flow_get_default_t(struct fq *, struct fq_tin *, int idx, diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 4e6131c..8b237e4 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -12,24 +12,22 @@ /* functions that are embedded into includer */ -static struct sk_buff *fq_flow_dequeue(struct fq *fq, - struct fq_flow *flow) +static void fq_adjust_removal(struct fq *fq, + struct fq_flow *flow, + struct sk_buff *skb) { struct fq_tin *tin = flow->tin; - struct fq_flow *i; - struct sk_buff *skb; - - lockdep_assert_held(&fq->lock); - - skb = __skb_dequeue(&flow->queue); - if (!skb) - return NULL; tin->backlog_bytes -= skb->len; tin->backlog_packets--; flow->backlog -= skb->len; fq->backlog--; fq->memory_usage -= skb->truesize; +} + +static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) +{ + struct fq_flow *i; if (flow->backlog == 0) { list_del_init(&flow->backlogchain); @@ -43,6 +41,21 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, list_move_tail(&flow->backlogchain, &i->backlogchain); } +} + +static struct sk_buff *fq_flow_dequeue(struct fq *fq, + struct fq_flow *flow) +{ + struct sk_buff *skb; + + lockdep_assert_held(&fq->lock); + + skb = __skb_dequeue(&flow->queue); + if (!skb) + return NULL; + + fq_adjust_removal(fq, flow, skb); + fq_rejigger_backlog(fq, flow); return skb; } @@ -188,6 +201,45 @@ static void fq_tin_enqueue(struct fq *fq, } } +static void fq_flow_filter(struct fq *fq, + struct fq_flow *flow, + fq_skb_filter_t filter_func, + void *filter_data, + fq_skb_free_t free_func) +{ + struct fq_tin *tin = flow->tin; + struct sk_buff *skb, *tmp; + + lockdep_assert_held(&fq->lock); + + skb_queue_walk_safe(&flow->queue, skb, tmp) { + if (!filter_func(fq, tin, flow, skb, filter_data)) + continue; + + __skb_unlink(skb, &flow->queue); + fq_adjust_removal(fq, flow, skb); + free_func(fq, tin, flow, skb); + } + + fq_rejigger_backlog(fq, flow); +} + +static void fq_tin_filter(struct fq *fq, + struct fq_tin *tin, + fq_skb_filter_t filter_func, + void *filter_data, + fq_skb_free_t free_func) +{ + struct fq_flow *flow; + + lockdep_assert_held(&fq->lock); + + list_for_each_entry(flow, &tin->new_flows, flowchain) + fq_flow_filter(fq, flow, filter_func, filter_data, free_func); + list_for_each_entry(flow, &tin->old_flows, flowchain) + fq_flow_filter(fq, flow, filter_func, filter_data, free_func); +} + static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) -- cgit v1.1 From 1ea4ff3e9f0b8d53e680a2bb9e8e644bf03aeb4d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Sep 2017 16:07:22 +0200 Subject: cfg80211: support reloading regulatory database If the regulatory database is loaded, and then updated, it may be necessary to reload it. Add an nl80211 command to do this. Note that this just reloads the database, it doesn't re-apply the rules from it immediately. Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 95832ce..f882fe1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -990,6 +990,8 @@ * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed * &NL80211_CMD_DISCONNECT should be indicated instead. * + * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1194,6 +1196,8 @@ enum nl80211_commands { NL80211_CMD_PORT_AUTHORIZED, + NL80211_CMD_RELOAD_REGDB, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ -- cgit v1.1 From 4a269818a7eb8577d32d8b2879099c689ddbd856 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Oct 2017 13:27:29 -0700 Subject: tcp: fix tcp_unlink_write_queue() Yury reported crash with this signature : [ 554.034021] [] 0xffff80003ccd5a58 [ 554.034156] [] skb_release_all+0x14/0x30 [ 554.034288] [] __kfree_skb+0x14/0x28 [ 554.034409] [] tcp_sendmsg_locked+0x4dc/0xcc8 [ 554.034541] [] tcp_sendmsg+0x34/0x58 [ 554.034659] [] inet_sendmsg+0x2c/0xf8 [ 554.034783] [] sock_sendmsg+0x18/0x30 [ 554.034928] [] SyS_sendto+0x84/0xf8 Problem is that skb->destructor contains garbage, and this is because I accidentally removed tcp_skb_tsorted_anchor_cleanup() from tcp_unlink_write_queue() This would trigger with a write(fd, , len) attempt, and we will add to packetdrill this capability to avoid future regressions. Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Reported-by: Yury Norov Tested-by: Yury Norov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a95e58..1516345 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1712,6 +1712,7 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -- cgit v1.1 From 28978713c51b0a70acf748f76f9d6d2d20dcf980 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:18 -0700 Subject: net: qrtr: Move constants to header file The constants are used by both the name server and clients, so clarify their value and move them to the uapi header. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- include/uapi/linux/qrtr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 9d76c56..63e8803 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -4,6 +4,9 @@ #include #include +#define QRTR_NODE_BCAST 0xffffffffu +#define QRTR_PORT_CTRL 0xfffffffeu + struct sockaddr_qrtr { __kernel_sa_family_t sq_family; __u32 sq_node; -- cgit v1.1 From da7653f0faabbe45eb2d3fd6e4b400fe003e81ae Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:19 -0700 Subject: net: qrtr: Add control packet definition to uapi The QMUX protocol specification defines structure of the special control packet messages being sent between handlers of the control port. Add these to the uapi header, as this structure and the associated types are shared between the kernel and all userspace handlers of control messages. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- include/uapi/linux/qrtr.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 63e8803..179af648 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -13,4 +13,36 @@ struct sockaddr_qrtr { __u32 sq_port; }; +enum qrtr_pkt_type { + QRTR_TYPE_DATA = 1, + QRTR_TYPE_HELLO = 2, + QRTR_TYPE_BYE = 3, + QRTR_TYPE_NEW_SERVER = 4, + QRTR_TYPE_DEL_SERVER = 5, + QRTR_TYPE_DEL_CLIENT = 6, + QRTR_TYPE_RESUME_TX = 7, + QRTR_TYPE_EXIT = 8, + QRTR_TYPE_PING = 9, + QRTR_TYPE_NEW_LOOKUP = 10, + QRTR_TYPE_DEL_LOOKUP = 11, +}; + +struct qrtr_ctrl_pkt { + __le32 cmd; + + union { + struct { + __le32 service; + __le32 instance; + __le32 node; + __le32 port; + } server; + + struct { + __le32 node; + __le32 port; + } client; + }; +} __packed; + #endif /* _LINUX_QRTR_H */ -- cgit v1.1 From 843e79d05addd8eb06992cd6dfafc7b9d53f2bc8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:07 +0200 Subject: net: sched: make tc_action_ops->get_dev return dev and avoid passing net Return dev directly, NULL if not possible. That is enough. Makes no sense to pass struct net * to get_dev op, as there is only one net possible, the one the action was created in. So just store it in mirred priv and use directly. Rename the mirred op callback function. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 3 +-- include/net/tc_act/tc_mirred.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index b944e0e..900168a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -93,8 +93,7 @@ struct tc_action_ops { int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); - int (*get_dev)(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev); + struct net_device *(*get_dev)(const struct tc_action *a); }; struct tc_action_net { diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 604bc31..21a6565 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -10,6 +10,7 @@ struct tcf_mirred { int tcfm_ifindex; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; + struct net *net; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) -- cgit v1.1 From b3f55bdda8df55a563005e00b1b71212d8546541 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:08 +0200 Subject: net: sched: introduce per-egress action device callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule and specified device acts as tc action egress device. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 34 ++++++++++++++++++++++++++++++++++ include/net/pkt_cls.h | 2 ++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 900168a..f5e8c90 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -174,4 +174,38 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, #endif } +typedef int tc_setup_cb_t(enum tc_setup_type type, + void *type_data, void *cb_priv); + +#ifdef CONFIG_NET_CLS_ACT +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop); +#else +static inline +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ +} + +static inline +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + return 0; +} +#endif + #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e80edd8..6f8149c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -206,6 +206,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, struct net_device **hw_dev); +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); /** * struct tcf_pkt_info - packet information -- cgit v1.1 From 717503b9cf57c0bb7ea4d3a9f5699c9a04adf988 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:09 +0200 Subject: net: sched: convert cls_flower->egress_dev users to tc_setup_cb_egdev infra The only user of cls_flower->egress_dev is mlx5. So do the conversion there alongside with the code originating the call in cls_flower function fl_hw_replace_filter to the newly introduced egress device callback infrastucture. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6f8149c..c0bdf5c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -206,8 +206,6 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, struct net_device **hw_dev); -int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop); /** * struct tcf_pkt_info - packet information @@ -407,6 +405,9 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; -- cgit v1.1 From 7578d7b45ed870b13a8ace57e32feaed623c2a94 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:10 +0200 Subject: net: sched: remove unused tcf_exts_get_dev helper and cls_flower->egress_dev The helper and the struct field ares no longer used by any code, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c0bdf5c..f526374 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -204,8 +204,6 @@ void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev); /** * struct tcf_pkt_info - packet information @@ -517,7 +515,6 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; - bool egress_dev; }; enum tc_matchall_command { -- cgit v1.1 From 437d2762ba07f0fc639d5a09acb323fe4106a61f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Oct 2017 20:45:40 -0700 Subject: tcp: remove obsolete helpers Remove three inline helpers that are no longer needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1516345..3b3b9b9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1629,18 +1629,6 @@ static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) return skb_peek_tail(&sk->sk_write_queue); } -static inline struct sk_buff *tcp_write_queue_next(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_next(&sk->sk_write_queue, skb); -} - -static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_prev(&sk->sk_write_queue, skb); -} - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) @@ -1697,11 +1685,6 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb } } -static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb) -{ - __skb_queue_head(&sk->sk_write_queue, skb); -} - /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, -- cgit v1.1 From 60724d4bae14cd295b27b1610cad9a2720eb0860 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:48 -0700 Subject: net: dsa: Add support for DSA specific notifiers In preparation for communicating a given DSA network device's port number and switch index, create a specialized DSA notifier and two events: DSA_PORT_REGISTER and DSA_PORT_UNREGISTER that communicate: the slave network device (slave_dev), port number and switch number in the tree. This will be later used for network device drivers like bcmsysport which needs to cooperate with its DSA network devices to set-up queue mapping and scheduling. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 10dcecc..40a709a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -471,4 +471,49 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) } #endif /* CONFIG_PM_SLEEP */ +enum dsa_notifier_type { + DSA_PORT_REGISTER, + DSA_PORT_UNREGISTER, +}; + +struct dsa_notifier_info { + struct net_device *dev; +}; + +struct dsa_notifier_register_info { + struct dsa_notifier_info info; /* must be first */ + struct net_device *master; + unsigned int port_number; + unsigned int switch_number; +}; + +static inline struct net_device * +dsa_notifier_info_to_dev(const struct dsa_notifier_info *info) +{ + return info->dev; +} + +#if IS_ENABLED(CONFIG_NET_DSA) +int register_dsa_notifier(struct notifier_block *nb); +int unregister_dsa_notifier(struct notifier_block *nb); +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info); +#else +static inline int register_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int unregister_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + return NOTIFY_DONE; +} +#endif + #endif -- cgit v1.1 From 0a5f14ce67a6e093e651d3cd75e6ac281123d93a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:49 -0700 Subject: net: dsa: tag_brcm: Indicate to master netdevice port + queue We need to tell the DSA master network device doing the actual transmission what the desired switch port and queue number is for it to resolve that to the internal transmit queue it is mapped to. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 40a709a..ce1d622 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -516,4 +516,9 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, } #endif +/* Broadcom tag specific helpers to insert and extract queue/port number */ +#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) +#define BRCM_TAG_GET_PORT(v) ((v) >> 8) +#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + #endif -- cgit v1.1 From ad2d116c5242875bba27522682ec5ba7f0df75f0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 11:14:49 -0700 Subject: sched: tc_mirred: Remove whitespaces This file contains unnecessary whitespaces as newlines, remove them, found by looking at what struct tc_mirred looks like. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_mirred.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 3d7a2b3..69038c2 100644 --- a/include/uapi/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h @@ -9,13 +9,13 @@ #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */ #define TCA_INGRESS_REDIR 3 /* packet redirect to INGRESS*/ #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */ - + struct tc_mirred { tc_gen; int eaction; /* one of IN/EGRESS_MIRROR/REDIR */ __u32 ifindex; /* ifindex of egress port */ }; - + enum { TCA_MIRRED_UNSPEC, TCA_MIRRED_TM, @@ -24,5 +24,5 @@ enum { __TCA_MIRRED_MAX }; #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1) - + #endif -- cgit v1.1 From 8f04748016f3b583e675e0f649d42cfc10812a8b Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 11 Oct 2017 10:50:29 -0400 Subject: net sched actions: change IFE modules alias names Make style of module alias name consistent with other subsystems in kernel, for example net devices. Fixes: 084e2f6566d2 ("Support to encoding decoding skb mark on IFE action") Fixes: 200e10f46936 ("Support to encoding decoding skb prio on IFE action") Fixes: 408fbc22ef1e ("net sched ife action: Introduce skb tcindex metadata encap decap") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 30ba459..104578f 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -40,7 +40,7 @@ struct tcf_meta_ops { struct module *owner; }; -#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ifemeta" __stringify_1(metan)) +#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ife-meta-" metan) int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi); -- cgit v1.1 From aa9fd9a325d51fa0b11153b03b8fefff569fa955 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 11 Oct 2017 17:16:08 -0400 Subject: sched: act: ife: update parameters via rcu handling This patch changes the parameter updating via RCU and not protected by a spinlock anymore. This reduce the time that the spinlock is being held. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 104578f..c7fb99c 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -6,12 +6,18 @@ #include #include -struct tcf_ife_info { - struct tc_action common; +struct tcf_ife_params { u8 eth_dst[ETH_ALEN]; u8 eth_src[ETH_ALEN]; u16 eth_type; u16 flags; + + struct rcu_head rcu; +}; + +struct tcf_ife_info { + struct tc_action common; + struct tcf_ife_params __rcu *params; /* list of metaids allowed */ struct list_head metalist; }; -- cgit v1.1 From 2355a6546a053b1c16ebefd6ce1f0cccc00e1da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 12 Oct 2017 10:21:25 +0200 Subject: net: phy: broadcom: support new device flag for setting master mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of Broadcom's PHYs run by default in slave mode with Automatic Slave/Master configuration disabled. It stops them from working properly with some devices. So far it has been verified for BCM54210E and BCM50212E which don't work well with Intel's I217-LM and I218-LM: http://ark.intel.com/products/60019/Intel-Ethernet-Connection-I217-LM http://ark.intel.com/products/71307/Intel-Ethernet-Connection-I218-LM I was told there is massive ping loss. This commit adds support for a new flag which can be set by an ethernet driver to fixup PHY setup. Signed-off-by: Rafał Miłecki Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index abcda9b..9ac9e3e 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -63,6 +63,7 @@ #define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00002000 #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 +#define PHY_BRCM_EN_MASTER_MODE 0x00010000 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) -- cgit v1.1 From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:23 +0200 Subject: tipc: introduce communication groups As a preparation for introducing flow control for multicast and datagram messaging we need a more strictly defined framework than we have now. A socket must be able keep track of exactly how many and which other sockets it is allowed to communicate with at any moment, and keep the necessary state for those. We therefore introduce a new concept we have named Communication Group. Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN. The call takes four parameters: 'type' serves as group identifier, 'instance' serves as an logical member identifier, and 'scope' indicates the visibility of the group (node/cluster/zone). Finally, 'flags' makes it possible to set certain properties for the member. For now, there is only one flag, indicating if the creator of the socket wants to receive a copy of broadcast or multicast messages it is sending via the socket, and if wants to be eligible as destination for its own anycasts. A group is closed, i.e., sockets which have not joined a group will not be able to send messages to or receive messages from members of the group, and vice versa. Any member of a group can send multicast ('group broadcast') messages to all group members, optionally including itself, using the primitive send(). The messages are received via the recvmsg() primitive. A socket can only be member of one group at a time. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5351b08..5f7b2c4 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -231,6 +231,20 @@ struct sockaddr_tipc { #define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ #define TIPC_MCAST_BROADCAST 133 /* Default: TIPC selects. No arg */ #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ +#define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ +#define TIPC_GROUP_LEAVE 136 /* No argument */ + +/* + * Flag values + */ +#define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ + +struct tipc_group_req { + __u32 type; /* group id */ + __u32 instance; /* member id */ + __u32 scope; /* zone/cluster/node */ + __u32 flags; +}; /* * Maximum sizes of TIPC bearer-related names (including terminating NULL) -- cgit v1.1 From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:25 +0200 Subject: tipc: receive group membership events via member socket Like with any other service, group members' availability can be subscribed for by connecting to be topology server. However, because the events arrive via a different socket than the member socket, there is a real risk that membership events my arrive out of synch with the actual JOIN/LEAVE action. I.e., it is possible to receive the first messages from a new member before the corresponding JOIN event arrives, just as it is possible to receive the last messages from a leaving member after the LEAVE event has already been received. Since each member socket is internally also subscribing for membership events, we now fix this problem by passing those events on to the user via the member socket. We leverage the already present member synch- ronization protocol to guarantee correct message/event order. An event is delivered to the user as an empty message where the two source addresses identify the new/lost member. Furthermore, we set the MSG_OOB bit in the message flags to mark it as an event. If the event is an indication about a member loss we also set the MSG_EOR bit, so it can be distinguished from a member addition event. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5f7b2c4..ef41c11 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -238,6 +238,7 @@ struct sockaddr_tipc { * Flag values */ #define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ +#define TIPC_GROUP_MEMBER_EVTS 0x2 /* Receive membership events in socket */ struct tipc_group_req { __u32 type; /* group id */ -- cgit v1.1 From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 7 Sep 2017 04:00:06 -0700 Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio The offload types currently supported in mqprio are 0 (no offload) and 1 (offload only TCs) by setting these values for the 'hw' option. If offloads are supported by setting the 'hw' option to 1, the default offload mode is 'dcb' where only the TC values are offloaded to the device. This patch introduces a new hardware offload mode called 'channel' with 'hw' set to 1 in mqprio which makes full use of the mqprio options, the TCs, the queue configurations and the QoS parameters for the TCs. This is achieved through a new netlink attribute for the 'mode' option which takes values such as 'dcb' (default) and 'channel'. The 'channel' mode also supports QoS attributes for traffic class such as minimum and maximum values for bandwidth rate limits. This patch enables configuring additional HW shaper attributes associated with a traffic class. Currently the shaper for bandwidth rate limiting is supported which takes options such as minimum and maximum bandwidth rates and are offloaded to the hardware in the 'channel' mode. The min and max limits for bandwidth rates are provided by the user along with the TCs and the queue configurations when creating the mqprio qdisc. The interface can be extended to support new HW shapers in future through the 'shaper' attribute. Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading mqprio queue options and use this to be shared between the kernel and device driver. This contains a copy of the existing data structure for mqprio queue options. This new data structure can be extended when adding new attributes for traffic class such as mode, shaper, shaper parameters (bandwidth rate limits). The existing data structure for mqprio queue options will be shared between the kernel and userspace. Example: queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\ min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit To dump the bandwidth rates: qdisc mqprio 804a: root tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 queues:(0:3) (4:7) mode:channel shaper:bw_rlimit min_rate:1Gbit 2Gbit max_rate:4Gbit 5Gbit Signed-off-by: Amritha Nambiar Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/net/pkt_cls.h | 9 +++++++++ include/uapi/linux/pkt_sched.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f526374..60d3978 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -546,6 +546,15 @@ struct tc_cls_bpf_offload { u32 gen_flags; }; +struct tc_mqprio_qopt_offload { + /* struct tc_mqprio_qopt must always be the first element */ + struct tc_mqprio_qopt qopt; + u16 mode; + u16 shaper; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; +}; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 099bf55..e95b5c9 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -625,6 +625,22 @@ enum { #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; @@ -633,6 +649,22 @@ struct tc_mqprio_qopt { __u16 offset[TC_QOPT_MAX_QUEUE]; }; +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + /* SFB */ enum { -- cgit v1.1 From 17a9422de78c3a59b490b400f555635c477f1476 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Wed, 11 Oct 2017 14:49:43 -0700 Subject: i40e/i40evf: don't trust VF to reset itself When using 'ethtool -L' on a VF to change number of requested queues from PF, we shouldn't trust the VF to reset itself after making the request. Doing it that way opens the door for a potentially malicious VF to do nasty things to the PF which should never be the case. This makes it such that after VF makes a successful request, PF will then reset the VF to institute required changes. Only if the request fails will PF send a message back to VF letting it know the request was unsuccessful. Testing-hints: There should be no real functional changes. This is simply hardening against a potentially malicious VF. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 60e5d90..3ce6134 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -333,8 +333,8 @@ struct virtchnl_vsi_queue_config_info { * additional queues must be negotiated. This is a best effort request as it * is possible the PF does not have enough queues left to support the request. * If the PF cannot support the number requested it will respond with the - * maximum number it is able to support; otherwise it will respond with the - * number requested. + * maximum number it is able to support. If the request is successful, PF will + * then reset the VF to institute required changes. */ /* VF resource request */ -- cgit v1.1 From 7c39afb394c79e72c3795b4a42d55155b34ee073 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Tue, 15 Aug 2017 13:46:04 +0300 Subject: net/mlx5: PTP code migration to driver core section PTP code is moved to core section of mlx5 driver in order to share it between ethernet and infiniband. This movement involves the following changes: - Change mlx5e_ prefix to be mlx5_ - Add clock structs to Core - Add clock object to mlx5_core_dev - Call Init/Uninit clock from core init/cleanup - Rename mlx5e_tstamp to be mlx5_clock Signed-off-by: Feras Daoud Signed-off-by: Eitan Rabin Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 401c897..08c77b7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -49,6 +49,8 @@ #include #include #include +#include +#include enum { MLX5_BOARD_ID_LEN = 64, @@ -760,6 +762,27 @@ struct mlx5_rsvd_gids { struct ida ida; }; +#define MAX_PIN_NUM 8 +struct mlx5_pps { + u8 pin_caps[MAX_PIN_NUM]; + struct work_struct out_work; + u64 start[MAX_PIN_NUM]; + u8 enabled; +}; + +struct mlx5_clock { + rwlock_t lock; + struct cyclecounter cycles; + struct timecounter tc; + struct hwtstamp_config hwtstamp_config; + u32 nominal_c_mult; + unsigned long overflow_period; + struct delayed_work overflow_work; + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; +}; + struct mlx5_core_dev { struct pci_dev *pdev; /* sync pci state */ @@ -800,6 +823,7 @@ struct mlx5_core_dev { #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap; #endif + struct mlx5_clock clock; }; struct mlx5_db { -- cgit v1.1 From 841f4f24053acad69240c6ab7427a1d24bc29491 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 13 Oct 2017 14:18:09 -0400 Subject: net: dsa: remove .set_addr Now that there is no user for the .set_addr function, remove it from DSA. If a switch supports this feature (like mv88e6xxx), the implementation can be done in the driver setup. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ce1d622..2746741 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -291,7 +291,6 @@ struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); int (*setup)(struct dsa_switch *ds); - int (*set_addr)(struct dsa_switch *ds, u8 *addr); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* -- cgit v1.1 From e086101b150ae8e99e54ab26101ef3835fa9f48d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 13:03:16 -0700 Subject: tcp: add a tracepoint for tcp retransmission We need a real-time notification for tcp retransmission for monitoring. Of course we could use ftrace to dynamically instrument this kernel function too, however we can't retrieve the connection information at the same time, for example perf-tools [1] reads /proc/net/tcp for socket details, which is slow when we have a lots of connections. Therefore, this patch adds a tracepoint for __tcp_retransmit_skb() and exposes src/dst IP addresses and ports of the connection. This also makes it easier to integrate into perf. Note, I expose both IPv4 and IPv6 addresses at the same time: for a IPv4 socket, v4 mapped address is used as IPv6 addresses, for a IPv6 socket, LOOPBACK4_IPV6 is already filled by kernel. Also, add sk and skb pointers as they are useful for BPF. 1. https://github.com/brendangregg/perf-tools/blob/master/net/tcpretrans Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Hannes Frederic Sowa Cc: Brendan Gregg Cc: Neal Cardwell Signed-off-by: Cong Wang Acked-by: Alexei Starovoitov Acked-by: Brendan Gregg Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 include/trace/events/tcp.h (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h new file mode 100644 index 0000000..3d1cbd0 --- /dev/null +++ b/include/trace/events/tcp.h @@ -0,0 +1,68 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tcp + +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TCP_H + +#include +#include +#include +#include + +TRACE_EVENT(tcp_retransmit_skb, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + __field(void *, skbaddr) + __field(void *, skaddr) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skbaddr = skb; + __entry->skaddr = sk; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + + if (np) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = np->saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = *(np->daddr_cache); + } else { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6 daddrv6=%pI6", + __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + +#endif /* _TRACE_TCP_H */ + +/* This part must be outside protection */ +#include -- cgit v1.1 From 32302902ff093891d8e64439cbb8ceae83e21ef8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 12 Oct 2017 11:38:45 -0700 Subject: mqprio: Reserve last 32 classid values for HW traffic classes and misc IDs This patch makes a slight tweak to mqprio in order to bring the classid values used back in line with what is used for mq. The general idea is to reserve values :ffe0 - :ffef to identify hardware traffic classes normally reported via dev->num_tc. By doing this we can maintain a consistent behavior with mq for classid where :1 - :ffdf will represent a physical qdisc mapped onto a Tx queue represented by classid - 1, and the traffic classes will be mapped onto a known subset of classid values reserved for our virtual qdiscs. Note I reserved the range from :fff0 - :ffff since this way we might be able to reuse these classid values with clsact and ingress which would mean that for mq, mqprio, ingress, and clsact we should be able to maintain a similar classid layout. Signed-off-by: Alexander Duyck Tested-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e95b5c9..e7cc3d3 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -74,6 +74,7 @@ struct tc_estimator { #define TC_H_INGRESS (0xFFFFFFF1U) #define TC_H_CLSACT TC_H_INGRESS +#define TC_H_MIN_PRIORITY 0xFFE0U #define TC_H_MIN_INGRESS 0xFFF2U #define TC_H_MIN_EGRESS 0xFFF3U -- cgit v1.1 From 69d78ef25c7b0058674145500efb12255738ba8a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:57 +0200 Subject: net: sched: store Qdisc pointer in struct block Prepare for removal of tp->q and store Qdisc pointer in the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 ++-- include/net/sch_generic.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 60d3978..e6c9e1e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -22,7 +22,7 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain); + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); void tcf_block_put(struct tcf_block *block); int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -30,7 +30,7 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, #else static inline int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { return 0; } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 684d8ed..df4032c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -270,6 +270,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct Qdisc *q; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.1 From 855319becbcffec6988a4e781a861b69a71c5b58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:58 +0200 Subject: net: sched: store net pointer in block and introduce qdisc_net helper Store net pointer in the block structure. Along the way, introduce qdisc_net helper which allows to easily obtain net pointer for qdisc instance. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 7 +++++++ include/net/sch_generic.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 259bc19..2d234af 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #define DEFAULT_TX_QUEUE_LEN 1000 @@ -146,4 +148,9 @@ static inline bool is_classid_clsact_egress(u32 classid) TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); } +static inline struct net *qdisc_net(struct Qdisc *q) +{ + return dev_net(q->dev_queue->dev); +} + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index df4032c..9b2cb91 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -270,6 +270,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct net *net; struct Qdisc *q; }; -- cgit v1.1 From 44186460c85a0121562db7cfef132d63c869118f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:59 +0200 Subject: net: sched: introduce tcf_block_q and tcf_block_dev helpers These helpers allows to get a q and netdev pointers for given block easily. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e6c9e1e..7bed674 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -24,6 +24,17 @@ void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); void tcf_block_put(struct tcf_block *block); + +static inline struct Qdisc *tcf_block_q(struct tcf_block *block) +{ + return block->q; +} + +static inline struct net_device *tcf_block_dev(struct tcf_block *block) +{ + return tcf_block_q(block)->dev_queue->dev; +} + int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -39,6 +50,16 @@ static inline void tcf_block_put(struct tcf_block *block) { } +static inline struct Qdisc *tcf_block_q(struct tcf_block *block) +{ + return NULL; +} + +static inline struct net_device *tcf_block_dev(struct tcf_block *block) +{ + return NULL; +} + static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { -- cgit v1.1 From 34e3759cf86a3e75463e34c1bb9691777406a175 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:00 +0200 Subject: net: sched: teach tcf_bind/unbind_filter to use block->q Whenever the block->q is set, it can be used instead of tp->q as it contains the same value. When it is not set, which can't happen now but it might happen with the follow-up shared blocks introduction, the class is not set in the result. That would lead to a class lookup instead of direct class pointer use for classful qdiscs. However, it is not planned to support classful qdisqs sharing filter blocks, so that may never happen. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 7bed674..49a143e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -74,36 +74,43 @@ __cls_set_class(unsigned long *clp, unsigned long cl) } static inline unsigned long -cls_set_class(struct tcf_proto *tp, unsigned long *clp, - unsigned long cl) +cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl) { unsigned long old_cl; - - tcf_tree_lock(tp); + + sch_tree_lock(q); old_cl = __cls_set_class(clp, cl); - tcf_tree_unlock(tp); - + sch_tree_unlock(q); return old_cl; } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { + struct Qdisc *q = tp->chain->block->q; unsigned long cl; - cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, r->classid); - cl = cls_set_class(tp, &r->class, cl); + /* Check q as it is not set for shared blocks. In that case, + * setting class is not supported. + */ + if (!q) + return; + cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); + cl = cls_set_class(q, &r->class, cl); if (cl) - tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { + struct Qdisc *q = tp->chain->block->q; unsigned long cl; + if (!q) + return; if ((cl = __cls_set_class(&r->class, 0)) != 0) - tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + q->ops->cl_ops->unbind_tcf(q, cl); } struct tcf_exts { -- cgit v1.1 From 74e3be6021d22df2ffcb691eae1affeb2bd0128e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:04 +0200 Subject: net: sched: use tcf_block_q helper to get q pointer for sch_tree_lock Use tcf_block_q helper to get q pointer to be used for direct call of sch_tree_lock/unlock instead of tcf_tree_lock/unlock. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9b2cb91..0aea9e2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -359,9 +359,6 @@ static inline void sch_tree_unlock(const struct Qdisc *q) spin_unlock_bh(qdisc_root_sleeping_lock(q)); } -#define tcf_tree_lock(tp) sch_tree_lock((tp)->q) -#define tcf_tree_unlock(tp) sch_tree_unlock((tp)->q) - extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; -- cgit v1.1 From 0da4af00b2ed3dbe46788623a696c4169447eadc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:08:07 -0700 Subject: ipv6: only update __use and lastusetime once per jiffy at most In order to not dirty the cacheline too often, we try to only update dst->__use and dst->lastusetime at most once per jiffy. As dst->lastusetime is only used by ipv6 garbage collector, it should be good enough time resolution. And __use is only used in ipv6_route_seq_show() to show how many times a dst has been used. And as __use is not atomic_t right now, it does not show the precise number of usage times anyway. So we think it should be OK to only update it at most once per jiffy. According to my latest syn flood test on a machine with intel Xeon 6th gen processor and 2 10G mlx nics bonded together, each with 8 rx queues on 2 NUMA nodes: With this patch, the packet process rate increases from ~3.49Mpps to ~3.75Mpps with a 7% increase rate. Note: dst_use() is being renamed to dst_hold_and_use() to better specify the purpose of the function. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 204c19e..5047e805 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -255,17 +255,18 @@ static inline void dst_hold(struct dst_entry *dst) WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); } -static inline void dst_use(struct dst_entry *dst, unsigned long time) +static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - dst_hold(dst); - dst->__use++; - dst->lastuse = time; + if (time != dst->lastuse) { + dst->__use++; + dst->lastuse = time; + } } -static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +static inline void dst_hold_and_use(struct dst_entry *dst, unsigned long time) { - dst->__use++; - dst->lastuse = time; + dst_hold(dst); + dst_use_noref(dst, time); } static inline struct dst_entry *dst_clone(struct dst_entry *dst) -- cgit v1.1 From 9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:40:02 -0400 Subject: tracing: bpf: Hide bpf trace events when they are not used All the trace events defined in include/trace/events/bpf.h are only used when CONFIG_BPF_SYSCALL is defined. But this file gets included by include/linux/bpf_trace.h which is included by the networking code with CREATE_TRACE_POINTS defined. If a trace event is created but not used it still has data structures and functions created for its use, even though nothing is using them. To not waste space, do not define the BPF trace events in bpf.h unless CONFIG_BPF_SYSCALL is defined. Signed-off-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/trace/events/bpf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h index 52c8425..1fb58fa 100644 --- a/include/trace/events/bpf.h +++ b/include/trace/events/bpf.h @@ -4,6 +4,9 @@ #if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_H +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL + #include #include #include @@ -345,7 +348,7 @@ TRACE_EVENT(bpf_map_next_key, __print_hex(__get_dynamic_array(nxt), __entry->key_len), __entry->key_trunc ? " ..." : "") ); - +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _TRACE_BPF_H */ #include -- cgit v1.1 From 5a6cd6de76ae78b651e7c36eba8b1da465d65f06 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Thu, 5 Oct 2017 14:53:40 -0700 Subject: ethtool: add ethtool_intersect_link_masks This function provides a way to intersect two link masks together to find the common ground between them. For example in i40e, the driver first generates link masks for what is supported by the PHY type. The driver then gets the link masks for what the NVM supports. The resulting intersection between them yields what can truly be supported. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/ethtool.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4587a4c..c77fa35 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -163,6 +163,16 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); +/** + * ethtool_intersect_link_masks - Given two link masks, AND them together + * @dst: first mask and where result is stored + * @src: second mask to intersect with + * + * Given two link mode masks, AND them together and save the result in dst. + */ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src); + void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32); -- cgit v1.1 From a68f4a27f55f1d54e35c270aff89383da4b1b656 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:36:39 +0100 Subject: rxrpc: Support service upgrade from a kernel service Provide support for a kernel service to make use of the service upgrade facility. This involves: (1) Pass an upgrade request flag to rxrpc_kernel_begin_call(). (2) Make rxrpc_kernel_recv_data() return the call's current service ID so that the caller can detect service upgrade and see what the service was upgraded to. Signed-off-by: David Howells --- include/net/af_rxrpc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 3ac7915..820dd36 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -49,12 +49,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, unsigned long, s64, gfp_t, - rxrpc_notify_rx_t); + rxrpc_notify_rx_t, + bool); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, - void *, size_t, size_t *, bool, u32 *); + void *, size_t, size_t *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -- cgit v1.1 From f4d15fb6f99af9b99f688bd87579137be44f85ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:07:31 +0100 Subject: rxrpc: Provide functions for allowing cleaner handling of signals Provide a couple of functions to allow cleaner handling of signals in a kernel service. They are: (1) rxrpc_kernel_get_rtt() This allows the kernel service to find out the RTT time for a call, so as to better judge how large a timeout to employ. Note, though, that whilst this returns a value in nanoseconds, the timeouts can only actually be in jiffies. (2) rxrpc_kernel_check_life() This returns a number that is updated when ACKs are received from the peer (notably including PING RESPONSE ACKs which we can elicit by sending PING ACKs to see if the call still exists on the server). The caller should compare the numbers of two calls to see if the call is still alive. These can be used to provide an extending timeout rather than returning immediately in the case that a signal occurs that would otherwise abort an RPC operation. The timeout would be extended if the server is still responsive and the call is still apparently alive on the server. For most operations this isn't that necessary - but for FS.StoreData it is: OpenAFS writes the data to storage as it comes in without making a backup, so if we immediately abort it when partially complete on a CTRL+C, say, we have no idea of the state of the file after the abort. Signed-off-by: David Howells --- include/net/af_rxrpc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 820dd36..2b3a6ee 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -61,6 +61,7 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); +u64 rxrpc_kernel_get_rtt(struct socket *, struct rxrpc_call *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); @@ -68,5 +69,6 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *, struct key *); int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); +u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); #endif /* _NET_RXRPC_H */ -- cgit v1.1 From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implement the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocation are done yet. The main concern in this patch is to ensure the datapath can run without any locking. This adds complexity to the setup and tear-down procedure, which assumptions are extra carefully documented in the code comments. V2: - make sure array isn't larger than NR_CPUS - make sure CPUs added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all notice by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exit after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and gramma by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6f1a567..814c108 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #ifdef CONFIG_STREAM_PARSER BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6db9e1d..4303fb6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { -- cgit v1.1 From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. This served as a good benchmark for measuring the overhead of remote refcnt decrement. If driver page recycle cache is not efficient then this, exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind is being so efficient to transfer memory pointers between CPUs, without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as implementation require a separate upstream discussion. V5: - Fix a maybe-uninitialized pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a more simple void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/bpf.h | 31 ++++++++++++++++++++++++++++++- include/trace/events/xdp.h | 10 ++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4373125..6d4dd844 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -355,6 +355,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); +void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); +void __cpu_map_flush(struct bpf_map *map); +struct xdp_buff; +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx); + /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { @@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) attr->numa_node : NUMA_NO_NODE; } -#else +#else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); @@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) static inline void __dev_map_flush(struct bpf_map *map) { } + +static inline +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + return NULL; +} + +static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index) +{ +} + +static inline void __cpu_map_flush(struct bpf_map *map) +{ +} + +struct xdp_buff; +static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, + struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} #endif /* CONFIG_BPF_SYSCALL */ #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4e16c43..eb2ece9 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) ); +#define devmap_ifindex(fwd, map) \ + (!fwd ? 0 : \ + (!map ? 0 : \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct net_device *)fwd)->ifindex : 0))) + #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) #endif /* _TRACE_XDP_H */ -- cgit v1.1 From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:39 +0200 Subject: bpf: cpumap xdp_buff to skb conversion and allocation This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU. For constructing the SKB on the remote CPU, the xdp_buff in converted into a struct xdp_pkt, and it mapped into the top headroom of the packet, to avoid allocating separate mem. For now, struct xdp_pkt is just a cpumap internal data structure, with info carried between enqueue to dequeue. If a driver doesn't have enough headroom it is simply dropped, with return code -EOVERFLOW. This will be picked up the xdp tracepoint infrastructure, to allow users to catch this. V2: take into account xdp->data_meta V4: - Drop busypoll tricks, keeping it more simple. - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei V5: correct RCU read protection around __netif_receive_skb_core. V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31bb301..bf014af 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); +int netif_receive_skb_core(struct sk_buff *skb); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -- cgit v1.1 From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:44 +0200 Subject: bpf: cpumap add tracepoints This adds two tracepoint to the cpumap. One for the enqueue side trace_xdp_cpumap_enqueue() and one for the kthread dequeue side trace_xdp_cpumap_kthread(). To mitigate the tracepoint overhead, these are invoked during the enqueue/dequeue bulking phases, thus amortizing the cost. The obvious use-cases are for debugging and monitoring. The non-intuitive use-case is using these as a feedback loop to know the system load. One can imagine auto-scaling by reducing, adding or activating more worker CPUs on demand. V4: tracepoint remove time_limit info, instead add sched info V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/xdp.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index eb2ece9..0c8dec6 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -150,6 +150,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) +TRACE_EVENT(xdp_cpumap_kthread, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int sched), + + TP_ARGS(map_id, processed, drops, sched), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, sched) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->sched = sched; + ), + + TP_printk("kthread" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " sched=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->sched) +); + +TRACE_EVENT(xdp_cpumap_enqueue, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int to_cpu), + + TP_ARGS(map_id, processed, drops, to_cpu), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, to_cpu) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->to_cpu = to_cpu; + ), + + TP_printk("enqueue" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " to_cpu=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->to_cpu) +); + #endif /* _TRACE_XDP_H */ #include -- cgit v1.1 From f8b8b1cd5aadd221742b45eb0ee3c8a80abf036a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:18 -0400 Subject: net: dsa: split dsa_port's netdev member The dsa_port structure has a "netdev" member, which can be used for either the master device, or the slave device, depending on its type. It is true that today, CPU port are not exposed to userspace, thus the port's netdev member can be used to point to its master interface. But it is still slightly confusing, so split it into more explicit "master" and "slave" members inside an anonymous union. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2746741..6ed1a17 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -164,6 +164,14 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* A CPU port is physically connected to a master device. + * A user port exposed to userspace has a slave device. + */ + union { + struct net_device *master; + struct net_device *slave; + }; + /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; @@ -176,7 +184,6 @@ struct dsa_port { unsigned int index; const char *name; struct dsa_port *cpu_dp; - struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; u8 stp_state; -- cgit v1.1 From c8652c83bc84ac8db44060ced0036de7628aa5e5 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:19 -0400 Subject: net: dsa: add dsa_to_port helper The dsa_port structure is part of DSA core data and must only be updated by the later. It is OK and sometimes necessary for the DSA drivers to access this data, but this has to be read only. For that purpose, add a dsa_to_port() helper which returns a const pointer to a dsa_port structure which must be used by DSA drivers from now on instead of digging into ds->ports[] themselves. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6ed1a17..38961ef 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -269,6 +269,11 @@ static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); } +static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) +{ + return &ds->ports[p]; +} + static inline u8 dsa_upstream_port(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; -- cgit v1.1 From eb4ddaf474285a4c6986f4a1c3205bdb0bed2da9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:45 -0700 Subject: net/decnet: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Johannes Berg Cc: David Ahern Cc: linux-decnet-user@lists.sourceforge.net Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/dn.h | 7 ------- include/net/dn_nsp.h | 1 - 2 files changed, 8 deletions(-) (limited to 'include') diff --git a/include/net/dn.h b/include/net/dn.h index 913b73d..4394f7d 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -122,13 +122,6 @@ struct dn_scp /* Session Control Port */ unsigned long keepalive; void (*keepalive_fxn)(struct sock *sk); - /* - * This stuff is for the fast timer for delayed acks - */ - struct timer_list delack_timer; - int delack_pending; - void (*delack_fxn)(struct sock *sk); - }; static inline struct dn_scp *DN_SK(struct sock *sk) diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h index 3a3e33d..413a15e 100644 --- a/include/net/dn_nsp.h +++ b/include/net/dn_nsp.h @@ -17,7 +17,6 @@ void dn_nsp_send_data_ack(struct sock *sk); void dn_nsp_send_oth_ack(struct sock *sk); -void dn_nsp_delayed_ack(struct sock *sk); void dn_send_conn_ack(struct sock *sk); void dn_send_conn_conf(struct sock *sk, gfp_t gfp); void dn_nsp_send_disc(struct sock *sk, unsigned char type, -- cgit v1.1 From 59f379f9046a9e0532ffd19b44e3c32fe79ec51b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:19 -0700 Subject: inet/connection_sock: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Gerrit Renker Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: dccp@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 13e4c89..0358745 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -169,9 +169,9 @@ enum inet_csk_ack_state_t { }; void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)); + void (*retransmit_handler)(struct timer_list *), + void (*delack_handler)(struct timer_list *), + void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) -- cgit v1.1 From 78802011fbe34331bdef6f2dfb1634011f0e4c32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:20 -0700 Subject: inet: frags: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Alexander Aring Cc: Stefan Schmidt Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: linux-wpan@vger.kernel.org Cc: netdev@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Kees Cook Acked-by: Stefan Schmidt # for ieee802154 Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index fc59e07..c695807 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -95,7 +95,7 @@ struct inet_frags { void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); - void (*frag_expire)(unsigned long data); + void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; }; -- cgit v1.1 From fb6ff75e18937a20dbec1eb47b5f893f38eabae4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 16 Oct 2017 14:24:02 -0700 Subject: tcp: Use pI6c in tcp tracepoint The compact form for IPv6 addresses is more user friendly than the full version. For example: compact: 2001:db8:1::1 full: 2001:0db8:0001:0000:0000:0000:0000:0004i Update the tcp tracepoint to show the compact form. Signed-off-by: David Ahern Acked-by: Cong Wang Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3d1cbd0..1ffab6d 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -57,7 +57,7 @@ TRACE_EVENT(tcp_retransmit_skb, } ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6 daddrv6=%pI6", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); -- cgit v1.1 From 386fd5da401dc6c4b0ab6a54d333609876b699fe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 16 Oct 2017 15:32:07 -0700 Subject: tcp: Check daddr_cache before use in tracepoint Running perf in one window to capture tcp_retransmit_skb tracepoint: $ perf record -e tcp:tcp_retransmit_skb -a And causing a retransmission on an active TCP session (e.g., dropping packets in the receiver, changing MTU on the interface to 500 and back to 1500) triggers a panic: [ 58.543144] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 58.545300] IP: perf_trace_tcp_retransmit_skb+0xd0/0x145 [ 58.546770] PGD 0 P4D 0 [ 58.547472] Oops: 0000 [#1] SMP [ 58.548328] Modules linked in: vrf [ 58.549262] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc4+ #26 [ 58.551004] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 58.554560] task: ffffffff81a0e540 task.stack: ffffffff81a00000 [ 58.555817] RIP: 0010:perf_trace_tcp_retransmit_skb+0xd0/0x145 [ 58.557137] RSP: 0018:ffff88003fc03d68 EFLAGS: 00010282 [ 58.558292] RAX: 0000000000000000 RBX: ffffe8ffffc0ec80 RCX: ffff880038543098 [ 58.559850] RDX: 0400000000000000 RSI: ffff88003fc03d70 RDI: ffff88003fc14b68 [ 58.561099] RBP: ffff88003fc03da8 R08: 0000000000000000 R09: ffffea0000d3224a [ 58.562005] R10: ffff88003fc03db8 R11: 0000000000000010 R12: ffff8800385428c0 [ 58.562930] R13: ffffe8ffffc0e478 R14: ffffffff81a93a40 R15: ffff88003d4f0c00 [ 58.563845] FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 [ 58.564873] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 58.565613] CR2: 0000000000000008 CR3: 000000003d68f004 CR4: 00000000000606f0 [ 58.566538] Call Trace: [ 58.566865] [ 58.567140] __tcp_retransmit_skb+0x4ab/0x4c6 [ 58.567704] ? tcp_set_ca_state+0x22/0x3f [ 58.568231] tcp_retransmit_skb+0x14/0xa3 [ 58.568754] tcp_retransmit_timer+0x472/0x5e3 [ 58.569324] ? tcp_write_timer_handler+0x1e9/0x1e9 [ 58.569946] tcp_write_timer_handler+0x95/0x1e9 [ 58.570548] tcp_write_timer+0x2a/0x58 Check that daddr_cache is non-NULL before de-referencing. Fixes: e086101b150a ("tcp: add a tracepoint for tcp retransmission") Signed-off-by: David Ahern Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1ffab6d..f51c130 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -27,7 +27,6 @@ TRACE_EVENT(tcp_retransmit_skb, ), TP_fast_assign( - struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; @@ -44,11 +43,12 @@ TRACE_EVENT(tcp_retransmit_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; - if (np) { + /* IPv6 socket ? */ + if (inet6_sk(sk)) { pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = np->saddr; + *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = *(np->daddr_cache); + *pin6 = sk->sk_v6_daddr; } else { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); -- cgit v1.1 From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 15 ++++++++++----- include/linux/bpf_types.h | 28 ++++++++++++++-------------- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6d4dd844..e1fba55 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -157,6 +157,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +struct bpf_prog_ops { + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +}; + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -172,8 +177,6 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); - int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); }; struct bpf_prog_aux { @@ -184,7 +187,8 @@ struct bpf_prog_aux { u32 id; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; - const struct bpf_verifier_ops *ops; + const struct bpf_prog_ops *ops; + const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; @@ -279,8 +283,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); -#define BPF_PROG_TYPE(_id, _ops) \ - extern const struct bpf_verifier_ops _ops; +#define BPF_PROG_TYPE(_id, _name) \ + extern const struct bpf_prog_ops _name ## _prog_ops; \ + extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #include diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 814c108..36418ad 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -1,22 +1,22 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) -- cgit v1.1 From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:54 -0700 Subject: bpf: remove the verifier ops from program structure Since the verifier ops don't have to be associated with the program for its entire lifetime we can move it to verifier's struct bpf_verifier_env. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 - include/linux/bpf_verifier.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1fba55..cf91977 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -188,7 +188,6 @@ struct bpf_prog_aux { struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; - const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f00ef75..feeaea9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -141,6 +141,7 @@ struct bpf_ext_analyzer_ops { */ struct bpf_verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ + const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ -- cgit v1.1 From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf91977..d67ccdc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -291,6 +291,9 @@ DECLARE_PER_CPU(int, bpf_prog_active); #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; +extern const struct bpf_verifier_ops xdp_analyzer_ops; + struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); -- cgit v1.1 From 7a0947e755084b918e33242fd558e55cb443408e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 17 Oct 2017 17:16:52 -0700 Subject: dql: make dql_init return void dql_init always returned 0, and the only place that uses it in network core code didn't care about the return value anyway. Signed-off-by: Stephen Hemminger Acked-by: Hiroaki SHIMODA Signed-off-by: David S. Miller --- include/linux/dynamic_queue_limits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index a4be703..f69f985 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -98,7 +98,7 @@ void dql_completed(struct dql *dql, unsigned int count); void dql_reset(struct dql *dql); /* Initialize dql state */ -int dql_init(struct dql *dql, unsigned hold_time); +void dql_init(struct dql *dql, unsigned int hold_time); #endif /* _KERNEL_ */ -- cgit v1.1 From 890056783c60ad9d0789774af2bc10fe4f27dd9d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 08:17:29 -0700 Subject: tcp: Remove use of inet6_sk and add IPv6 checks to tracepoint 386fd5da401d ("tcp: Check daddr_cache before use in tracepoint") was the second version of the tracepoint fixup patch. This patch is the delta between v2 and v3. Specifically, remove the use of inet6_sk and check sk_family as requested by Eric and add IS_ENABLED(CONFIG_IPV6) around the use of sk_v6_rcv_saddr and sk_v6_daddr as done in sock_common (noted by Cong). Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Tested-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index f51c130..c3220d9 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -43,13 +43,15 @@ TRACE_EVENT(tcp_retransmit_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; - /* IPv6 socket ? */ - if (inet6_sk(sk)) { +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; - } else { + } else +#endif + { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; -- cgit v1.1 From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:54 -0700 Subject: net: Add extack to validator_info structs used for address notifier Add extack to in_validator_info and in6_validator_info. Update the one user of each, ipvlan, to return an error message for failures. Only manual configuration of an address is plumbed in the IPv6 code path. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 751d051..681dff3 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -154,6 +154,7 @@ struct in_ifaddr { struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; + struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 87981cd..b8b1643 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -55,6 +55,7 @@ struct prefix_info { struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; + struct netlink_ext_ack *extack; }; #define IN6_ADDR_HSIZE_SHIFT 4 -- cgit v1.1 From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Oct 2017 11:22:51 -0700 Subject: tcp: socket option to set TCP fast open key New socket option TCP_FASTOPEN_KEY to allow different keys per listener. The listener by default uses the global key until the socket option is set. The key is a 16 bytes long binary data. This option has no effect on regular non-listener TCP sockets. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- include/net/request_sock.h | 2 ++ include/net/tcp.h | 5 +++-- include/uapi/linux/tcp.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 23e2205..3470155 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -150,6 +150,8 @@ struct fastopen_queue { spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ + + struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b3b9b9..1efe836 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,9 +1555,10 @@ struct tcp_fastopen_request { int copied; /* queued in tcp_connect() */ }; void tcp_free_fastopen_req(struct tcp_sock *tp); - +void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); -int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 15c25ec..69c7493 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -119,6 +119,7 @@ enum { #define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ struct tcp_repair_opt { __u32 opt_code; -- cgit v1.1 From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce the map read/write flags to the eBPF syscalls that returns the map fd. The flags is used to set up the file mode when construct a new file descriptor for bpf maps. To not break the backward capability, the f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise it should be O_RDONLY or O_WRONLY. When the userspace want to modify or read the map content, it will check the file mode to see if it is allowed to make the change. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 +++++--- include/uapi/linux/bpf.h | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d67ccdc..3e5508f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -315,11 +315,11 @@ void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; -int bpf_map_new_fd(struct bpf_map *map); +int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); -int bpf_obj_get_user(const char __user *pathname); +int bpf_obj_get_user(const char __user *pathname, int flags); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); @@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); +int bpf_get_file_flag(int flags); + /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent @@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } -static inline int bpf_obj_get_user(const char __user *pathname) +static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4303fb6..d83f95e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,10 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U +/* Flags for accessing BPF object */ +#define BPF_F_RDONLY (1U << 3) +#define BPF_F_WRONLY (1U << 4) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -260,6 +264,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_OBJ_* commands */ __aligned_u64 pathname; __u32 bpf_fd; + __u32 file_flags; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ @@ -287,6 +292,7 @@ union bpf_attr { __u32 map_id; }; __u32 next_id; + __u32 open_flags; }; struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ -- cgit v1.1 From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:24 -0700 Subject: security: bpf: Add LSM hooks for bpf object related syscall Introduce several LSM hooks for the syscalls that will allow the userspace to access to eBPF object such as eBPF programs and eBPF maps. The security check is aimed to enforce a per object security protection for eBPF object so only processes with the right priviliges can read/write to a specific map or use a specific eBPF program. Besides that, a general security hook is added before the multiplexer of bpf syscall to check the cmd and the attribute used for the command. The actual security module can decide which command need to be checked and how the cmd should be checked. Signed-off-by: Chenbo Feng Acked-by: James Morris Signed-off-by: David S. Miller --- include/linux/bpf.h | 6 ++++++ include/linux/lsm_hooks.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/security.h | 45 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3e5508f..84c192d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -57,6 +57,9 @@ struct bpf_map { atomic_t usercnt; struct bpf_map *inner_map_meta; char name[BPF_OBJ_NAME_LEN]; +#ifdef CONFIG_SECURITY + void *security; +#endif }; /* function argument constraints */ @@ -193,6 +196,9 @@ struct bpf_prog_aux { struct user_struct *user; u64 load_time; /* ns since boottime */ char name[BPF_OBJ_NAME_LEN]; +#ifdef CONFIG_SECURITY + void *security; +#endif union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c925812..7161d8e 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1351,6 +1351,40 @@ * @inode we wish to get the security context of. * @ctx is a pointer in which to place the allocated security context. * @ctxlen points to the place to put the length of @ctx. + * + * Security hooks for using the eBPF maps and programs functionalities through + * eBPF syscalls. + * + * @bpf: + * Do a initial check for all bpf syscalls after the attribute is copied + * into the kernel. The actual security module can implement their own + * rules to check the specific cmd they need. + * + * @bpf_map: + * Do a check when the kernel generate and return a file descriptor for + * eBPF maps. + * + * @map: bpf map that we want to access + * @mask: the access flags + * + * @bpf_prog: + * Do a check when the kernel generate and return a file descriptor for + * eBPF programs. + * + * @prog: bpf prog that userspace want to use. + * + * @bpf_map_alloc_security: + * Initialize the security field inside bpf map. + * + * @bpf_map_free_security: + * Clean up the security information stored inside bpf map. + * + * @bpf_prog_alloc_security: + * Initialize the security field inside bpf program. + * + * @bpf_prog_free_security: + * Clean up the security information stored inside bpf prog. + * */ union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); @@ -1682,6 +1716,17 @@ union security_list_options { struct audit_context *actx); void (*audit_rule_free)(void *lsmrule); #endif /* CONFIG_AUDIT */ + +#ifdef CONFIG_BPF_SYSCALL + int (*bpf)(int cmd, union bpf_attr *attr, + unsigned int size); + int (*bpf_map)(struct bpf_map *map, fmode_t fmode); + int (*bpf_prog)(struct bpf_prog *prog); + int (*bpf_map_alloc_security)(struct bpf_map *map); + void (*bpf_map_free_security)(struct bpf_map *map); + int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux); + void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); +#endif /* CONFIG_BPF_SYSCALL */ }; struct security_hook_heads { @@ -1901,6 +1946,15 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ +#ifdef CONFIG_BPF_SYSCALL + struct list_head bpf; + struct list_head bpf_map; + struct list_head bpf_prog; + struct list_head bpf_map_alloc_security; + struct list_head bpf_map_free_security; + struct list_head bpf_prog_alloc_security; + struct list_head bpf_prog_free_security; +#endif /* CONFIG_BPF_SYSCALL */ } __randomize_layout; /* diff --git a/include/linux/security.h b/include/linux/security.h index ce62659..18800b0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,6 +31,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -1730,6 +1731,50 @@ static inline void securityfs_remove(struct dentry *dentry) #endif +#ifdef CONFIG_BPF_SYSCALL +#ifdef CONFIG_SECURITY +extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); +extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); +extern int security_bpf_prog(struct bpf_prog *prog); +extern int security_bpf_map_alloc(struct bpf_map *map); +extern void security_bpf_map_free(struct bpf_map *map); +extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); +extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +#else +static inline int security_bpf(int cmd, union bpf_attr *attr, + unsigned int size) +{ + return 0; +} + +static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode) +{ + return 0; +} + +static inline int security_bpf_prog(struct bpf_prog *prog) +{ + return 0; +} + +static inline int security_bpf_map_alloc(struct bpf_map *map) +{ + return 0; +} + +static inline void security_bpf_map_free(struct bpf_map *map) +{ } + +static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +{ + return 0; +} + +static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +{ } +#endif /* CONFIG_SECURITY */ +#endif /* CONFIG_BPF_SYSCALL */ + #ifdef CONFIG_SECURITY static inline char *alloc_secdata(void) -- cgit v1.1 From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:26 -0700 Subject: selinux: bpf: Add addtional check for bpf object file receive Introduce a bpf object related check when sending and receiving files through unix domain socket as well as binder. It checks if the receiving process have privilege to read/write the bpf map or use the bpf program. This check is necessary because the bpf maps and programs are using a anonymous inode as their shared inode so the normal way of checking the files and sockets when passing between processes cannot work properly on eBPF object. This check only works when the BPF_SYSCALL is configured. Signed-off-by: Chenbo Feng Acked-by: Stephen Smalley Reviewed-by: James Morris Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84c192d..1e334b2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -288,6 +288,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern const struct file_operations bpf_map_fops; +extern const struct file_operations bpf_prog_fops; + #define BPF_PROG_TYPE(_id, _name) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; -- cgit v1.1 From b65f164d37cf6d4aac59b0e13c2e5c4cfe293fd2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 09:31:43 +0200 Subject: ipv6: let trace_fib6_table_lookup() dereference the fib table The perf traces for ipv6 routing code show a relevant cost around trace_fib6_table_lookup(), even if no trace is enabled. This is due to the fib6_table de-referencing currently performed by the caller. Let's the tracing code pay this overhead, passing to the trace helper the table pointer. This gives small but measurable performance improvement under UDP flood. Signed-off-by: Paolo Abeni Acked-by: Steven Rostedt (VMware) Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/trace/events/fib6.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index d60096c..b34bed1 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,9 +12,9 @@ TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct rt6_info *rt, - u32 tb_id, const struct flowi6 *flp), + struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, tb_id, flp), + TP_ARGS(net, rt, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -34,7 +34,7 @@ TRACE_EVENT(fib6_table_lookup, TP_fast_assign( struct in6_addr *in6; - __entry->tb_id = tb_id; + __entry->tb_id = table->tb6_id; __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); -- cgit v1.1 From 8c4083b30e56fc71b0e94c26374b32d95d5ea461 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:29 +0200 Subject: net: sched: add block bind/unbind notif. and extended block_get/put Introduce new type of ndo_setup_tc message to propage binding/unbinding of a block to driver. Call this ndo whenever qdisc gets/puts a block. Alongside with this, there's need to propagate binder type from qdisc code down to the notifier. So introduce extended variants of block_get/put in order to pass this info. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf014af..4de5b08 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -775,6 +775,7 @@ enum tc_setup_type { TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, + TC_SETUP_BLOCK, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 49a143e..41bc7d7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -17,13 +17,27 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +enum tcf_block_binder_type { + TCF_BLOCK_BINDER_TYPE_UNSPEC, +}; + +struct tcf_block_ext_info { + enum tcf_block_binder_type binder_type; +}; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { @@ -46,10 +60,25 @@ int tcf_block_get(struct tcf_block **p_block, return 0; } +static inline +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + return 0; +} + static inline void tcf_block_put(struct tcf_block *block) { } +static inline +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ +} + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; @@ -434,6 +463,17 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop); +enum tc_block_command { + TC_BLOCK_BIND, + TC_BLOCK_UNBIND, +}; + +struct tc_block_offload { + enum tc_block_command command; + enum tcf_block_binder_type binder_type; + struct tcf_block *block; +}; + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; -- cgit v1.1 From 6e40cf2d4dee9dc22ff398041ce876bef8172dea Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:30 +0200 Subject: net: sched: use extended variants of block_get/put in ingress and clsact qdiscs Use previously introduced extended variants of block get and put functions. This allows to specify a binder types specific to clsact ingress/egress which is useful for drivers to distinguish who actually got the block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 41bc7d7..5c50af8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -19,6 +19,8 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); enum tcf_block_binder_type { TCF_BLOCK_BINDER_TYPE_UNSPEC, + TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS, }; struct tcf_block_ext_info { -- cgit v1.1 From acb674428c3d57bccbe3f4a1a7a009f6d73e9f41 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:31 +0200 Subject: net: sched: introduce per-block callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule for a specific block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++ include/net/sch_generic.h | 1 + 2 files changed, 82 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5c50af8..4bc6b1c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -27,6 +27,8 @@ struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; }; +struct tcf_block_cb; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); @@ -51,6 +53,21 @@ static inline struct net_device *tcf_block_dev(struct tcf_block *block) return tcf_block_q(block)->dev_queue->dev; } +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb); +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident); +void tcf_block_cb_incref(struct tcf_block_cb *block_cb); +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb); +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv); +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv); +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident); + int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -91,6 +108,70 @@ static inline struct net_device *tcf_block_dev(struct tcf_block *block) return NULL; } +static inline +int tc_setup_cb_block_register(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_block_unregister(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv) +{ +} + +static inline +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) +{ + return NULL; +} + +static inline +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ + return NULL; +} + +static inline +void tcf_block_cb_incref(struct tcf_block_cb *block_cb) +{ +} + +static inline +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) +{ + return 0; +} + +static inline +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + return NULL; +} + +static inline +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + return 0; +} + +static inline +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +{ +} + +static inline +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ +} + static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0aea9e2..031dffd 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -272,6 +272,7 @@ struct tcf_block { struct list_head chain_list; struct net *net; struct Qdisc *q; + struct list_head cb_list; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.1 From 208c0f4b5237f1d6611b2c679a8022d6901577d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:32 +0200 Subject: net: sched: use tc_setup_cb_call to call per-block callbacks Extend the tc_setup_cb_call entrypoint function originally used only for action egress devices callbacks to call per-block callbacks as well. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4bc6b1c..fcca5a9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -543,8 +543,8 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ -int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop); +int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, + enum tc_setup_type type, void *type_data, bool err_stop); enum tc_block_command { TC_BLOCK_BIND, -- cgit v1.1 From d58d31a118690b578897749feda48416ac10ca43 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:47 +0200 Subject: net: sched: remove unused classid field from tc_cls_common_offload It is no longer used by the drivers, so remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index fcca5a9..04caa24 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -561,7 +561,6 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; - u32 classid; }; static inline void @@ -571,7 +570,6 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - cls_common->classid = tp->classid; } struct tc_cls_u32_knode { -- cgit v1.1 From fa71212e91811ac67014ad19d4fc3b3c3446ccf7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:48 +0200 Subject: net: sched: remove unused is_classid_clsact_ingress/egress helpers These helpers are no longer in use by drivers, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2d234af..b8ecafc 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -135,19 +135,6 @@ static inline unsigned int psched_mtu(const struct net_device *dev) return dev->mtu + dev->hard_header_len; } -static inline bool is_classid_clsact_ingress(u32 classid) -{ - /* This also returns true for ingress qdisc */ - return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && - TC_H_MIN(classid) != TC_H_MIN(TC_H_MIN_EGRESS); -} - -static inline bool is_classid_clsact_egress(u32 classid) -{ - return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && - TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); -} - static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); -- cgit v1.1 From ff61b5e3f041c2f1aa8d7c700af3007889973889 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:37 +0300 Subject: drivers, net, mlx4: convert mlx4_cq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_cq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b0a57e0..daac2e3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -40,7 +40,7 @@ #include #include -#include +#include #include @@ -751,7 +751,7 @@ struct mlx4_cq { int cqn; unsigned vector; - atomic_t refcount; + refcount_t refcount; struct completion free; struct { struct list_head list; -- cgit v1.1 From 0068895ff845c38e9e2b65c002c53c623379e436 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:38 +0300 Subject: drivers, net, mlx4: convert mlx4_qp.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_qp.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index daac2e3..b8e19c4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -768,7 +768,7 @@ struct mlx4_qp { int qpn; - atomic_t refcount; + refcount_t refcount; struct completion free; u8 usage; }; -- cgit v1.1 From 17ac99b2b8d08ed40f4525491d6eff330329a6d2 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:39 +0300 Subject: drivers, net, mlx4: convert mlx4_srq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx4_srq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b8e19c4..a9b5fed 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -781,7 +781,7 @@ struct mlx4_srq { int max_gs; int wqe_shift; - atomic_t refcount; + refcount_t refcount; struct completion free; }; -- cgit v1.1 From a4b51a9f83c6d359ff8fc0c66009283b6fdeeaf8 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:40 +0300 Subject: drivers, net, mlx5: convert mlx5_cq.refcount from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable mlx5_cq.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- include/linux/mlx5/cq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 9589884..6a57ec2 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -35,7 +35,7 @@ #include #include - +#include struct mlx5_core_cq { u32 cqn; @@ -43,7 +43,7 @@ struct mlx5_core_cq { __be32 *set_ci_db; __be32 *arm_db; struct mlx5_uars_page *uar; - atomic_t refcount; + refcount_t refcount; struct completion free; unsigned vector; unsigned int irqn; -- cgit v1.1 From e65f7ee39b4d7604a78b03ed35d723e1001fc241 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 10:23:49 +0300 Subject: drivers, connector: convert cn_callback_entry.refcnt from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cn_callback_entry.refcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: David S. Miller --- include/linux/connector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/connector.h b/include/linux/connector.h index f8fe863..032102b 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -22,7 +22,7 @@ #define __CONNECTOR_H -#include +#include #include #include @@ -49,7 +49,7 @@ struct cn_callback_id { struct cn_callback_entry { struct list_head callback_entry; - atomic_t refcnt; + refcount_t refcnt; struct cn_queue_dev *pdev; struct cn_callback_id id; -- cgit v1.1 From e6546ef6d86d0fc38e0e84ccae80e641f3fc0087 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:39 -0700 Subject: bpf: add support for BPF_SOCK_OPS_BASE_RTT A congestion control algorithm can make a call to the BPF socket_ops program to request the base RTT. The base RTT can be congestion control dependent and is meant to represent a congestion threshold such that RTTs above it indicate congestion. This is especially useful for flows within a DC where the base RTT is easy to obtain. Being provided a base RTT solves a basic problem in RTT based congestion avoidance algorithms (such as Vegas, NV and BBR). Although it is easy to get the base RTT when the network is not congested, it is very diffcult to do when it is very congested. Newer connections get an inflated value of the base RTT leading to unfariness (newer flows with a larger base RTT get more bandwidth). As a result, RTT based congestion avoidance algorithms tend to update their base RTTs to improve fairness. In very congested networks this can lead to base RTT inflation, reducing the ability of these RTT based congestion control algorithms to prevent congestion. Note that in my experiments with TCP-NV, the base RTT provided can be much larger than the actual hardware RTT. For example, experimenting with hosts within a rack where the hardware RTT is 16-20us, I've used base RTTs up to 150us. The effect of using a larger base RTT is that the congestion avoidance algorithm will allow more queueing. When there are only a few flows the main effect is larger measured RTTs and RPC latencies due to the increased queueing. When there are a lot of flows, a larger base RTT can lead to more congestion and more packet drops. For this case, where the hardware RTT is 20us, a base RTT of 80us produces good results. This patch only introduces BPF_SOCK_OPS_BASE_RTT, a later patch in this set adds support for using it in TCP-NV. Further study and testing is needed before support can be added to other delay based congestion avoidance algorithms. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83f95e..1aca744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -955,6 +955,13 @@ enum { BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control * needs ECN */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -- cgit v1.1 From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:40 -0700 Subject: bpf: Adding helper function bpf_getsockops Adding support for helper function bpf_getsockops to socket_ops BPF programs. This patch only supports TCP_CONGESTION. Signed-off-by: Vlad Vysotsky Acked-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1aca744..f650346 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -613,12 +613,22 @@ union bpf_attr { * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) * Calls setsockopt. Not all opts are available, only those with * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPROTO_TCP + * Supported levels: SOL_SOCKET and IPPROTO_TCP * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPROTO_TCP + * @level: SOL_SOCKET or IPPROTO_TCP * @optname: option name * @optval: pointer to option value - * @optlen: length of optval in byes + * @optlen: length of optval in bytes + * Return: 0 or negative error + * + * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) + * Calls getsockopt. Not all opts are available. + * Supported levels: IPPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: IPPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in bytes * Return: 0 or negative error * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) @@ -721,7 +731,8 @@ union bpf_attr { FN(sock_map_update), \ FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ - FN(perf_prog_read_value), + FN(perf_prog_read_value), \ + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.1 From 40b16b9be5773a314948656c96adf7bf7cfdbd0b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 21 Oct 2017 11:45:46 +0200 Subject: batman-adv: use inline kernel-doc for uapi constants The enums of constants for netlink tends to become rather large over time. Documenting them is easier when the kernel-doc is actually next to constant and not in a different block above the enum. Also inline kernel-doc allows multi-paragraph description. This could be required to better document the netlink command types and the expected return values. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 369 +++++++++++++++++++++++++++++++--------- 1 file changed, 290 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a83ddb7..efd641c 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -24,20 +24,6 @@ /** * enum batadv_tt_client_flags - TT client specific flags - * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table - * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new - * update telling its new real location has not been received/sent yet - * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. - * This information is used by the "AP Isolation" feature - * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This - * information is used by the Extended Isolation feature - * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table - * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has - * not been announced yet - * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept - * in the table for one more originator interval for consistency purposes - * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of - * the network but no nnode has already announced it * * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. * Bits from 8 to 15 are called _local flags_ because they are used for local @@ -48,160 +34,385 @@ * in the TT CRC computation. */ enum batadv_tt_client_flags { + /** + * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table + */ BATADV_TT_CLIENT_DEL = (1 << 0), + + /** + * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and + * the new update telling its new real location has not been + * received/sent yet + */ BATADV_TT_CLIENT_ROAM = (1 << 1), + + /** + * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi + * interface. This information is used by the "AP Isolation" feature + */ BATADV_TT_CLIENT_WIFI = (1 << 4), + + /** + * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This + * information is used by the Extended Isolation feature + */ BATADV_TT_CLIENT_ISOLA = (1 << 5), + + /** + * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from + * the table + */ BATADV_TT_CLIENT_NOPURGE = (1 << 8), + + /** + * @BATADV_TT_CLIENT_NEW: this client has been added to the local table + * but has not been announced yet + */ BATADV_TT_CLIENT_NEW = (1 << 9), + + /** + * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it + * is kept in the table for one more originator interval for consistency + * purposes + */ BATADV_TT_CLIENT_PENDING = (1 << 10), + + /** + * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be + * part of the network but no nnode has already announced it + */ BATADV_TT_CLIENT_TEMP = (1 << 11), }; /** * enum batadv_nl_attrs - batman-adv netlink attributes - * - * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors - * @BATADV_ATTR_VERSION: batman-adv version string - * @BATADV_ATTR_ALGO_NAME: name of routing algorithm - * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface - * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface - * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface - * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface - * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface - * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv interface - * @BATADV_ATTR_ORIG_ADDRESS: originator mac address - * @BATADV_ATTR_TPMETER_RESULT: result of run (see batadv_tp_meter_status) - * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took - * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run - * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session - * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment - * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active - * @BATADV_ATTR_TT_ADDRESS: Client MAC address - * @BATADV_ATTR_TT_TTVN: Translation table version - * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version - * @BATADV_ATTR_TT_CRC32: CRC32 over translation table - * @BATADV_ATTR_TT_VID: VLAN ID - * @BATADV_ATTR_TT_FLAGS: Translation table client flags - * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best - * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen - * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address - * @BATADV_ATTR_TQ: TQ to neighbour - * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour - * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth - * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth - * @BATADV_ATTR_ROUTER: Gateway router MAC address - * @BATADV_ATTR_BLA_OWN: Flag indicating own originator - * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address - * @BATADV_ATTR_BLA_VID: BLA VLAN ID - * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address - * @BATADV_ATTR_BLA_CRC: BLA CRC - * @__BATADV_ATTR_AFTER_LAST: internal use - * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available - * @BATADV_ATTR_MAX: highest attribute number currently defined */ enum batadv_nl_attrs { + /** + * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors + */ BATADV_ATTR_UNSPEC, + + /** + * @BATADV_ATTR_VERSION: batman-adv version string + */ BATADV_ATTR_VERSION, + + /** + * @BATADV_ATTR_ALGO_NAME: name of routing algorithm + */ BATADV_ATTR_ALGO_NAME, + + /** + * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface + */ BATADV_ATTR_MESH_IFINDEX, + + /** + * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface + */ BATADV_ATTR_MESH_IFNAME, + + /** + * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface + */ BATADV_ATTR_MESH_ADDRESS, + + /** + * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface + */ BATADV_ATTR_HARD_IFINDEX, + + /** + * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface + */ BATADV_ATTR_HARD_IFNAME, + + /** + * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv + * interface + */ BATADV_ATTR_HARD_ADDRESS, + + /** + * @BATADV_ATTR_ORIG_ADDRESS: originator mac address + */ BATADV_ATTR_ORIG_ADDRESS, + + /** + * @BATADV_ATTR_TPMETER_RESULT: result of run (see + * batadv_tp_meter_status) + */ BATADV_ATTR_TPMETER_RESULT, + + /** + * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took + */ BATADV_ATTR_TPMETER_TEST_TIME, + + /** + * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run + */ BATADV_ATTR_TPMETER_BYTES, + + /** + * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session + */ BATADV_ATTR_TPMETER_COOKIE, + + /** + * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment + */ BATADV_ATTR_PAD, + + /** + * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active + */ BATADV_ATTR_ACTIVE, + + /** + * @BATADV_ATTR_TT_ADDRESS: Client MAC address + */ BATADV_ATTR_TT_ADDRESS, + + /** + * @BATADV_ATTR_TT_TTVN: Translation table version + */ BATADV_ATTR_TT_TTVN, + + /** + * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version + */ BATADV_ATTR_TT_LAST_TTVN, + + /** + * @BATADV_ATTR_TT_CRC32: CRC32 over translation table + */ BATADV_ATTR_TT_CRC32, + + /** + * @BATADV_ATTR_TT_VID: VLAN ID + */ BATADV_ATTR_TT_VID, + + /** + * @BATADV_ATTR_TT_FLAGS: Translation table client flags + */ BATADV_ATTR_TT_FLAGS, + + /** + * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best + */ BATADV_ATTR_FLAG_BEST, + + /** + * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen + */ BATADV_ATTR_LAST_SEEN_MSECS, + + /** + * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address + */ BATADV_ATTR_NEIGH_ADDRESS, + + /** + * @BATADV_ATTR_TQ: TQ to neighbour + */ BATADV_ATTR_TQ, + + /** + * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour + */ BATADV_ATTR_THROUGHPUT, + + /** + * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth + */ BATADV_ATTR_BANDWIDTH_UP, + + /** + * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth + */ BATADV_ATTR_BANDWIDTH_DOWN, + + /** + * @BATADV_ATTR_ROUTER: Gateway router MAC address + */ BATADV_ATTR_ROUTER, + + /** + * @BATADV_ATTR_BLA_OWN: Flag indicating own originator + */ BATADV_ATTR_BLA_OWN, + + /** + * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address + */ BATADV_ATTR_BLA_ADDRESS, + + /** + * @BATADV_ATTR_BLA_VID: BLA VLAN ID + */ BATADV_ATTR_BLA_VID, + + /** + * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address + */ BATADV_ATTR_BLA_BACKBONE, + + /** + * @BATADV_ATTR_BLA_CRC: BLA CRC + */ BATADV_ATTR_BLA_CRC, + /* add attributes above here, update the policy in netlink.c */ + + /** + * @__BATADV_ATTR_AFTER_LAST: internal use + */ __BATADV_ATTR_AFTER_LAST, + + /** + * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available + */ NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, + + /** + * @BATADV_ATTR_MAX: highest attribute number currently defined + */ BATADV_ATTR_MAX = __BATADV_ATTR_AFTER_LAST - 1 }; /** * enum batadv_nl_commands - supported batman-adv netlink commands - * - * @BATADV_CMD_UNSPEC: unspecified command to catch errors - * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv device - * @BATADV_CMD_TP_METER: Start a tp meter session - * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session - * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. - * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces - * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations - * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations - * @BATADV_CMD_GET_ORIGINATORS: Query list of originators - * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours - * @BATADV_CMD_GET_GATEWAYS: Query list of gateways - * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims - * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones - * @__BATADV_CMD_AFTER_LAST: internal use - * @BATADV_CMD_MAX: highest used command number */ enum batadv_nl_commands { + /** + * @BATADV_CMD_UNSPEC: unspecified command to catch errors + */ BATADV_CMD_UNSPEC, + + /** + * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv + * device + */ BATADV_CMD_GET_MESH_INFO, + + /** + * @BATADV_CMD_TP_METER: Start a tp meter session + */ BATADV_CMD_TP_METER, + + /** + * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session + */ BATADV_CMD_TP_METER_CANCEL, + + /** + * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. + */ BATADV_CMD_GET_ROUTING_ALGOS, + + /** + * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces + */ BATADV_CMD_GET_HARDIFS, + + /** + * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations + */ BATADV_CMD_GET_TRANSTABLE_LOCAL, + + /** + * @BATADV_CMD_GET_TRANSTABLE_GLOBAL: Query list of global translations + */ BATADV_CMD_GET_TRANSTABLE_GLOBAL, + + /** + * @BATADV_CMD_GET_ORIGINATORS: Query list of originators + */ BATADV_CMD_GET_ORIGINATORS, + + /** + * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours + */ BATADV_CMD_GET_NEIGHBORS, + + /** + * @BATADV_CMD_GET_GATEWAYS: Query list of gateways + */ BATADV_CMD_GET_GATEWAYS, + + /** + * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims + */ BATADV_CMD_GET_BLA_CLAIM, + + /** + * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance + * backbones + */ BATADV_CMD_GET_BLA_BACKBONE, + /* add new commands above here */ + + /** + * @__BATADV_CMD_AFTER_LAST: internal use + */ __BATADV_CMD_AFTER_LAST, + + /** + * @BATADV_CMD_MAX: highest used command number + */ BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 }; /** * enum batadv_tp_meter_reason - reason of a tp meter test run stop - * @BATADV_TP_REASON_COMPLETE: sender finished tp run - * @BATADV_TP_REASON_CANCEL: sender was stopped during run - * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or didn't - * answer - * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit - * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node already - * ongoing - * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory - * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface - * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions */ enum batadv_tp_meter_reason { + /** + * @BATADV_TP_REASON_COMPLETE: sender finished tp run + */ BATADV_TP_REASON_COMPLETE = 3, + + /** + * @BATADV_TP_REASON_CANCEL: sender was stopped during run + */ BATADV_TP_REASON_CANCEL = 4, + /* error status >= 128 */ + + /** + * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or + * didn't answer + */ BATADV_TP_REASON_DST_UNREACHABLE = 128, + + /** + * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit + */ BATADV_TP_REASON_RESEND_LIMIT = 129, + + /** + * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node + * already ongoing + */ BATADV_TP_REASON_ALREADY_ONGOING = 130, + + /** + * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory + */ BATADV_TP_REASON_MEMORY_ERROR = 131, + + /** + * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface + */ BATADV_TP_REASON_CANT_SEND = 132, + + /** + * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions + */ BATADV_TP_REASON_TOO_MANY = 133, }; -- cgit v1.1 From f6e37b25413cf636369668652e9752ee77c7d9f7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:22 -0700 Subject: tcp: add trace event class tcp_event_sk_skb Introduce event class tcp_event_sk_skb for tcp tracepoints that have arguments sk and skb. Existing tracepoint trace_tcp_retransmit_skb() falls into this class. This patch rewrites the definition of trace_tcp_retransmit_skb() with tcp_event_sk_skb. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c3220d9..14b0a708 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,7 +9,13 @@ #include #include -TRACE_EVENT(tcp_retransmit_skb, +/* + * tcp event with arguments sk and skb + * + * Note: this class requires a valid sk pointer; while skb pointer could + * be NULL. + */ +DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(struct sock *sk, struct sk_buff *skb), @@ -64,6 +70,13 @@ TRACE_EVENT(tcp_retransmit_skb, __entry->saddr_v6, __entry->daddr_v6) ); +DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.1 From 7344e29f285a94b965075599731811c352f3ab40 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:23 -0700 Subject: tcp: mark trace event arguments sk and skb as const Some functions that we plan to add trace points require const sk and/or skb. So we mark these fields as const in the tracepoint. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 14b0a708..2b6fe72 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -17,13 +17,13 @@ */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( - __field(void *, skbaddr) - __field(void *, skaddr) + __field(const void *, skbaddr) + __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __array(__u8, saddr, 4) @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); -- cgit v1.1 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 2b6fe72..3e57e1a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -77,6 +77,17 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_ARGS(sk, skb) ); +/* + * skb of trace_tcp_send_reset is the skb that caused RST. In case of + * active reset, skb should be NULL + */ +DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.1 From 5941521c05d69cf3f2b1293eefd21207e083b70f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:25 -0700 Subject: tcp: add tracepoint trace_tcp_receive_reset New tracepoint trace_tcp_receive_reset is added and called from tcp_reset(). This tracepoint is define with a new class tcp_event_sk. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3e57e1a..c83c711 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -88,6 +88,72 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_ARGS(sk, skb) ); +/* + * tcp event with arguments sk + * + * Note: this class requires a valid sk pointer. + */ +DECLARE_EVENT_CLASS(tcp_event_sk, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + +DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.1 From e1a4aa50f47303ebb3ca0cfd01687884551ce03d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:26 -0700 Subject: tcp: add tracepoint trace_tcp_destroy_sock This patch adds trace event trace_tcp_destroy_sock. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c83c711..1724c12 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -154,6 +154,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.1 From e8fce23946b7e7eadf25ad78d8207c22903dfe27 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:27 -0700 Subject: tcp: add tracepoint trace_tcp_set_state() This patch adds tracepoint trace_tcp_set_state. Besides usual fields (s/d ports, IP addresses), old and new state of the socket is also printed with TP_printk, with __print_symbolic(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1724c12..03699ba 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,6 +9,22 @@ #include #include +#define tcp_state_name(state) { state, #state } +#define show_tcp_state_name(val) \ + __print_symbolic(val, \ + tcp_state_name(TCP_ESTABLISHED), \ + tcp_state_name(TCP_SYN_SENT), \ + tcp_state_name(TCP_SYN_RECV), \ + tcp_state_name(TCP_FIN_WAIT1), \ + tcp_state_name(TCP_FIN_WAIT2), \ + tcp_state_name(TCP_TIME_WAIT), \ + tcp_state_name(TCP_CLOSE), \ + tcp_state_name(TCP_CLOSE_WAIT), \ + tcp_state_name(TCP_LAST_ACK), \ + tcp_state_name(TCP_LISTEN), \ + tcp_state_name(TCP_CLOSING), \ + tcp_state_name(TCP_NEW_SYN_RECV)) + /* * tcp event with arguments sk and skb * @@ -161,6 +177,66 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); +TRACE_EVENT(tcp_set_state, + + TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), + + TP_ARGS(sk, oldstate, newstate), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(int, oldstate) + __field(int, newstate) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->oldstate = oldstate; + __entry->newstate = newstate; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + show_tcp_state_name(__entry->oldstate), + show_tcp_state_name(__entry->newstate)) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.1 From 3f27fb23219e75343b094366f2358bff34300493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:47 -0700 Subject: ipv6: addrconf: add per netns perturbation in inet6_addr_hash() Bring IPv6 in par with IPv4 : - Use net_hash_mix() to spread addresses a bit more. - Use 256 slots hash table instead of 16 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index b8b1643..15b5ffd 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -58,7 +58,7 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 4 +#define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) int addrconf_init(void); -- cgit v1.1 From b6f4f8484d88b69f700907200a9a9ec73806355f Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Mon, 23 Oct 2017 15:35:58 -0400 Subject: net/sock: Update sk rcu iterator macro. Mark hlist node in sk rcu iterator as protected by the rcu. hlist_next_rcu accomplishes this and silences the warnings sparse throws. Found with make C=1 net/ipv4/udp.o on linux-next tag next-20171009. Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 4827094..6f1be97 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -737,10 +737,10 @@ static inline void sk_add_bind_node(struct sock *sk, * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ - for (pos = rcu_dereference((head)->first); \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ - pos = rcu_dereference(pos->next)) + pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(struct sock *sk) { -- cgit v1.1 From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 23 Oct 2017 13:22:23 -0700 Subject: tcp: Configure TFO without cookie per socket and/or per route We already allow to enable TFO without a cookie by using the fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or TFO_CLIENT_NO_COOKIE). This is safe to do in certain environments where we know that there isn't a malicous host (aka., data-centers) or when the application-protocol already provides an authentication mechanism in the first flight of data. A server however might be providing multiple services or talking to both sides (public Internet and data-center). So, this server would want to enable cookie-less TFO for certain services and/or for connections that go to the data-center. This patch exposes a socket-option and a per-route attribute to enable such fine-grained configurations. Signed-off-by: Christoph Paasch Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 3 ++- include/uapi/linux/rtnetlink.h | 2 ++ include/uapi/linux/tcp.h | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1d2c44e..173a7c2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,7 +215,8 @@ struct tcp_sock { u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - unused:4; + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ + unused:3; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c13484..2392f74 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1567,7 +1567,8 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index dab7dad..fe667926 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -430,6 +430,8 @@ enum { #define RTAX_QUICKACK RTAX_QUICKACK RTAX_CC_ALGO, #define RTAX_CC_ALGO RTAX_CC_ALGO + RTAX_FASTOPEN_NO_COOKIE, +#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE __RTAX_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 69c7493..d67e1d4 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -120,6 +120,7 @@ enum { #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ struct tcp_repair_opt { __u32 opt_code; -- cgit v1.1 From c4f3db15958277c03d1c324894255ea3ecbf86e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:40 +0200 Subject: netfilter: conntrack: add and use nf_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. As a first step, add and use a new log function for this, similar to nf_ct_helper_log(). Add __cold annotation -- invalid packets should be infrequent so gcc can consider all call paths that lead to such a function as unlikely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 738a030..6d79a06 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -152,8 +152,18 @@ extern const struct nla_policy nf_ct_port_nla_policy[]; #define LOG_INVALID(net, proto) \ ((net)->ct.sysctl_log_invalid == (proto) || \ (net)->ct.sysctl_log_invalid == IPPROTO_RAW) + +__printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...); #else static inline int LOG_INVALID(struct net *net, int proto) { return 0; } + +static inline __printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, + u16 pf, u8 protonum, const char *fmt, ...) {} #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ -- cgit v1.1 From 3d0b527bc9dc0e8c4428eb1a98d4cd27bd1114c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:41 +0200 Subject: netfilter: conntrack: add and use nf_ct_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. Add a wrapper for the previous patch that extracs the information from nf_conn and passes it to nf_l4proto_log_invalid(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6d79a06..5d51255 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -149,21 +149,23 @@ int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -#define LOG_INVALID(net, proto) \ - ((net)->ct.sysctl_log_invalid == (proto) || \ - (net)->ct.sysctl_log_invalid == IPPROTO_RAW) - +__printf(3, 4) __cold +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...); __printf(5, 6) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, u16 pf, u8 protonum, const char *fmt, ...); #else -static inline int LOG_INVALID(struct net *net, int proto) { return 0; } - static inline __printf(5, 6) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, u16 pf, u8 protonum, const char *fmt, ...) {} +static inline __printf(3, 4) __cold +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ -- cgit v1.1 From eb6fad5a4a328b85d3faa8b301b522e3f316b49d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:42 +0200 Subject: netfilter: conntrack: remove pf argument from l4 packet functions not needed/used anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 5d51255..e065188 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,6 @@ struct nf_conntrack_l4proto { const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts); /* Called when a new connection for this protocol found; -- cgit v1.1 From 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 09:38:30 +0200 Subject: netfilter: conntrack: make l3proto trackers const previous patches removed all writes to them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 2 +- include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 919e4e8..5534ecc 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,7 +10,7 @@ #define _NF_CONNTRACK_IPV4_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index eaea968..30dc579 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -1,7 +1,7 @@ #ifndef _NF_CONNTRACK_IPV6_H #define _NF_CONNTRACK_IPV6_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; -- cgit v1.1 From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sat, 21 Oct 2017 00:25:15 +0300 Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote address (outer v6 destination) is one of host's locally configured addresses. Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address (outer v6 source) is a local address. This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote endpoints are on same host; OTOH v4 tunnels (ipip or gre) allow such configurations. An example where this proves useful is a system where entities are identified by their unique v6 addresses, and use tunnels to encapsulate traffic between them. The limitation prevents placing several entities on same host. Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE which allows to bypass this restriction. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/uapi/linux/ip6_tunnel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 425926c..ffebbe3 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -20,6 +20,8 @@ #define IP6_TNL_F_RCV_DSCP_COPY 0x10 /* copy fwmark from inner packet */ #define IP6_TNL_F_USE_ORIG_FWMARK 0x20 +/* allow remote endpoint on the local node */ +#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40 struct ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ -- cgit v1.1 From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a kprobe/uprobe/tracepoint single trace event. Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative will be introduce a mutex lock in every trace_event_call structure, but it takes a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program will replace to-be-detached program in-place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/bpf.h | 30 +++++++++++++++++++++++++----- include/linux/trace_events.h | 43 +++++++++++++++++++++++++++++++++++++++---- include/trace/perf.h | 6 +++--- 3 files changed, 67 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e334b2..172be7f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,18 +273,38 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt); -#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog); +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array); + +#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog; \ + struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array *_array; \ u32 _ret = 1; \ rcu_read_lock(); \ - _prog = rcu_dereference(array)->progs; \ - for (; *_prog; _prog++) \ - _ret &= func(*_prog, ctx); \ + _array = rcu_dereference(array); \ + if (unlikely(check_non_null && !_array))\ + goto _out; \ + _prog = _array->progs; \ + while ((__prog = READ_ONCE(*_prog))) { \ + _ret &= func(__prog, ctx); \ + _prog++; \ + } \ +_out: \ rcu_read_unlock(); \ _ret; \ }) +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ + __BPF_PROG_RUN_ARRAY(array, ctx, func, false) + +#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ + __BPF_PROG_RUN_ARRAY(array, ctx, func, true) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2e0f222..fc6aeca 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -271,14 +271,37 @@ struct trace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; - struct bpf_prog *prog; - struct perf_event *bpf_prog_owner; + struct bpf_prog_array __rcu *prog_array; int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; +#ifdef CONFIG_PERF_EVENTS +static inline bool bpf_prog_array_valid(struct trace_event_call *call) +{ + /* + * This inline function checks whether call->prog_array + * is valid or not. The function is called in various places, + * outside rcu_read_lock/unlock, as a heuristic to speed up execution. + * + * If this function returns true, and later call->prog_array + * becomes false inside rcu_read_lock/unlock region, + * we bail out then. If this function return false, + * there is a risk that we might miss a few events if the checking + * were delayed until inside rcu_read_lock/unlock region and + * call->prog_array happened to become non-NULL then. + * + * Here, READ_ONCE() is used instead of rcu_access_pointer(). + * rcu_access_pointer() requires the actual definition of + * "struct bpf_prog_array" while READ_ONCE() only needs + * a declaration of the same type. + */ + return !!READ_ONCE(call->prog_array); +} +#endif + static inline const char * trace_event_name(struct trace_event_call *call) { @@ -435,12 +458,23 @@ trace_trigger_soft_disabled(struct trace_event_file *file) } #ifdef CONFIG_BPF_EVENTS -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_detach_bpf_prog(struct perf_event *event); #else -static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } + +static inline int +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } + #endif enum { @@ -511,6 +545,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, { perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event); } + #endif #endif /* _LINUX_TRACE_EVENT_H */ diff --git a/include/trace/perf.h b/include/trace/perf.h index 04fe68bb..14f127b6 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto) \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_raw_##call *entry; \ - struct bpf_prog *prog = event_call->prog; \ struct pt_regs *__regs; \ u64 __count = 1; \ struct task_struct *__task = NULL; \ @@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto) \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ head = this_cpu_ptr(event_call->perf_events); \ - if (!prog && __builtin_constant_p(!__task) && !__task && \ - hlist_empty(head)) \ + if (!bpf_prog_array_valid(event_call) && \ + __builtin_constant_p(!__task) && !__task && \ + hlist_empty(head)) \ return; \ \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ -- cgit v1.1 From ef5201c83d1400570a3b6f004ad7a23d71934411 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Oct 2017 13:54:20 +0800 Subject: bonding: remove rtmsg_ifinfo called after bond_lower_state_changed After the patch 'rtnetlink: bring NETDEV_CHANGELOWERSTATE event process back to rtnetlink_event', bond_lower_state_changed would generate NETDEV_CHANGEUPPER event which would send a notification to userspace in rtnetlink_event. There's no need to call rtmsg_ifinfo to send the notification any more. So this patch is to remove it from these places after bond_lower_state_changed. Besides, after this, rtmsg_ifinfo is not needed to be exported. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/bonding.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 2860cc6..f801fc9 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -330,7 +330,6 @@ static inline void bond_set_active_slave(struct slave *slave) slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -340,7 +339,6 @@ static inline void bond_set_backup_slave(struct slave *slave) slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -353,7 +351,6 @@ static inline void bond_set_slave_state(struct slave *slave, slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); bond_queue_slave_event(slave); slave->should_notify = 0; } else { @@ -385,7 +382,6 @@ static inline void bond_slave_state_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); - rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_ATOMIC); tmp->should_notify = 0; } } -- cgit v1.1 From 9c3b57518363577d4e2ea1964ef4fa03e100beaa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:31 -0700 Subject: net: sctp: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Vlad Yasevich Cc: Neil Horman Cc: "David S. Miller" Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2db3d3a..13cc496 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -72,7 +72,7 @@ typedef enum sctp_disposition (sctp_state_fn_t) ( const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); -typedef void (sctp_timer_event_t) (unsigned long); +typedef void (sctp_timer_event_t) (struct timer_list *); struct sctp_sm_table_entry { sctp_state_fn_t *fn; const char *name; @@ -314,10 +314,10 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, void *event_arg, gfp_t gfp); /* 2nd level prototypes */ -void sctp_generate_t3_rtx_event(unsigned long peer); -void sctp_generate_heartbeat_event(unsigned long peer); -void sctp_generate_reconf_event(unsigned long peer); -void sctp_generate_proto_unreach_event(unsigned long peer); +void sctp_generate_t3_rtx_event(struct timer_list *t); +void sctp_generate_heartbeat_event(struct timer_list *t); +void sctp_generate_reconf_event(struct timer_list *t); +void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); -- cgit v1.1 From fc8bcaa05160528d56432e4612f522e3ceafc513 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:48 -0700 Subject: net: LLC: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Hans Liljestrand Cc: "Paul E. McKenney" Cc: "Reshetova, Elena" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/llc_c_ac.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/llc_c_ac.h b/include/net/llc_c_ac.h index f3be818..e766300 100644 --- a/include/net/llc_c_ac.h +++ b/include/net/llc_c_ac.h @@ -171,10 +171,10 @@ int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_rsp_as_ack(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb); -void llc_conn_busy_tmr_cb(unsigned long timeout_data); -void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data); -void llc_conn_ack_tmr_cb(unsigned long timeout_data); -void llc_conn_rej_tmr_cb(unsigned long timeout_data); +void llc_conn_busy_tmr_cb(struct timer_list *t); +void llc_conn_pf_cycle_tmr_cb(struct timer_list *t); +void llc_conn_ack_tmr_cb(struct timer_list *t); +void llc_conn_rej_tmr_cb(struct timer_list *t); void llc_conn_set_p_flag(struct sock *sk, u8 value); #endif /* LLC_C_AC_H */ -- cgit v1.1 From 88ca59d1aaf28c25b47a9f933090e480ba6dc92a Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Wed, 25 Oct 2017 12:26:43 -0700 Subject: macvlan: remove unused fields in struct macvlan_dev commit 635b8c8ecdd2 ("tap: Renaming tap related APIs, data structures, macros") captured all the tap related fields into a new struct tap_dev. However, it failed to remove those fields from struct macvlan_dev. Those fields are currently unused and must be removed. While there I moved the comment for MAX_TAP_QUEUES to the right place. Fixes: 635b8c8ecdd27142 (tap: Renaming tap related APIs, data structures, macros) Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 15 --------------- include/linux/if_tap.h | 4 ++++ 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 10e319f..e13b369 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -10,13 +10,6 @@ #include struct macvlan_port; -struct macvtap_queue; - -/* - * Maximum times a macvtap device can be opened. This can be used to - * configure the number of receive queue, e.g. for multiqueue virtio. - */ -#define MAX_TAP_QUEUES 256 #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) @@ -35,14 +28,6 @@ struct macvlan_dev { netdev_features_t set_features; enum macvlan_mode mode; u16 flags; - /* This array tracks active taps. */ - struct tap_queue __rcu *taps[MAX_TAP_QUEUES]; - /* This list tracks all taps (both enabled and disabled) */ - struct list_head queue_list; - int numvtaps; - int numqueues; - netdev_features_t tap_features; - int minor; int nest_level; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157..d1b5173 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -22,6 +22,10 @@ static inline struct skb_array *tap_get_skb_array(struct file *f) #include #include +/* + * Maximum times a tap device can be opened. This can be used to + * configure the number of receive queue, e.g. for multiqueue virtio. + */ #define MAX_TAP_QUEUES 256 struct tap_queue; -- cgit v1.1 From 60e2a7780793bae0debc275a9ccd57f7da0cf195 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:45 +0200 Subject: tcp: TCP experimental option for SMC The SMC protocol [1] relies on the use of a new TCP experimental option [2, 3]. With this option, SMC capabilities are exchanged between peers during the TCP three way handshake. This patch adds support for this experimental option to TCP. References: [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 [2] Shared Use of TCP Experimental Options RFC 6994: https://tools.ietf.org/rfc/rfc6994.txt [3] IANA ExID SMCR: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 +++++++-- include/net/inet_sock.h | 3 ++- include/net/tcp.h | 7 +++++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 173a7c2..8c43138 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -98,7 +98,8 @@ struct tcp_options_received { tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ - sack_ok : 4, /* SACK seen on SYN packet */ + sack_ok : 3, /* SACK seen on SYN packet */ + smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 num_sacks; /* Number of SACK blocks */ @@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; +#if IS_ENABLED(CONFIG_SMC) + rx_opt->smc_ok = 0; +#endif } /* This is the max number of SACKS that we'll generate and process. It's safe @@ -229,7 +233,8 @@ struct tcp_sock { syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + syn_smc:1; /* SYN includes SMC */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f..c49938d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -92,7 +92,8 @@ struct inet_request_sock { wscale_ok : 1, ecn_ok : 1, acked : 1, - no_srccheck: 1; + no_srccheck: 1, + smc_ok : 1; kmemcheck_bitfield_end(flags); u32 ir_mark; union { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2392f74..285bc82 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 +#define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths @@ -203,6 +204,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 +#define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 @@ -213,6 +215,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 +#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -2108,4 +2111,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); } + +#if IS_ENABLED(CONFIG_SMC) +extern struct static_key_false tcp_have_smc; +#endif #endif /* _TCP_H */ -- cgit v1.1 From 32d18ab1d44166cbb1dcaf8b359183636734ddb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 24 Oct 2017 12:41:01 +0200 Subject: net: updating dst lastusage is an unlikely event. Since commit 0da4af00b2ed ("ipv6: only update __use and lastusetime once per jiffy at most"), updating the dst lastuse field is an unlikely action: it happens at most once per jiffy, out of potentially millions of calls per second. Mark explicitly the code as such, and let the compiler generate better code. Note: gcc 7.2 and several older versions do actually generate different - better - code when the unlikely() hint is in place, avoid jump in the fast path and keeping better code locality. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 5047e805..2f53ecc 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -257,7 +257,7 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (time != dst->lastuse) { + if (unlikely(time != dst->lastuse)) { dst->__use++; dst->lastuse = time; } -- cgit v1.1 From 032cfd66afcc2dd2c7be89c71b020fcb15bcc37d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Oct 2017 03:53:59 -0700 Subject: drivers/net: wan/sdla: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Allen Pais Cc: "David S. Miller" Cc: Tobias Klauser Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/linux/if_frad.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 46df7e5..82a1b4e 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -83,6 +83,7 @@ struct frad_local /* fields that are used by the Sangoma SDLA cards */ struct timer_list timer; + struct net_device *dev; int type; /* adapter type */ int state; /* state of the S502/8 control latch */ int buffer; /* current buffer for S508 firmware */ -- cgit v1.1 From 2ae21cf527da0e5cf9d7ee14bd5b0909bb9d1a75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:56 -0700 Subject: tcp: Namespace-ify sysctl_tcp_early_retrans Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2c4222a..a7f39e3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_sack; int sysctl_tcp_window_scaling; int sysctl_tcp_timestamps; + int sysctl_tcp_early_retrans; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 285bc82..a12b71d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,7 +265,6 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_recovery; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -- cgit v1.1 From e20223f1962831d1b1c416d59d259879d0639d68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:57 -0700 Subject: tcp: Namespace-ify sysctl_tcp_recovery Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a7f39e3..d6ed718 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -129,6 +129,7 @@ struct netns_ipv4 { int sysctl_tcp_window_scaling; int sysctl_tcp_timestamps; int sysctl_tcp_early_retrans; + int sysctl_tcp_recovery; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index a12b71d..c7f5153 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,7 +265,7 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern int sysctl_tcp_recovery; + #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ extern int sysctl_tcp_limit_output_bytes; -- cgit v1.1 From 2c04ac8ae0b61e0780a30b7069a11bb202b21f26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:58 -0700 Subject: tcp: Namespace-ify sysctl_tcp_thin_linear_timeouts Note that sysctl_tcp_thin_dupack was not used, I deleted it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d6ed718..2a9f37b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -130,6 +130,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; int sysctl_tcp_early_retrans; int sysctl_tcp_recovery; + int sysctl_tcp_thin_linear_timeouts; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index c7f5153..063a7a4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -263,8 +263,6 @@ extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; -extern int sysctl_tcp_thin_linear_timeouts; -extern int sysctl_tcp_thin_dupack; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -- cgit v1.1 From b510f0d23a47c3d1f074fe583e7867dc4918fe02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:59 -0700 Subject: tcp: Namespace-ify sysctl_tcp_slow_start_after_idle Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2a9f37b..8662692 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -131,6 +131,7 @@ struct netns_ipv4 { int sysctl_tcp_early_retrans; int sysctl_tcp_recovery; int sysctl_tcp_thin_linear_timeouts; + int sysctl_tcp_slow_start_after_idle; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 063a7a4..cc2ab52 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -262,7 +262,6 @@ extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -extern int sysctl_tcp_slow_start_after_idle; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1308,7 +1307,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || + if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; -- cgit v1.1 From e0a1e5b519236dc1662ff25e42560dd1be9e3776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:00 -0700 Subject: tcp: Namespace-ify sysctl_tcp_retrans_collapse Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8662692..b28c172 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -132,6 +132,7 @@ struct netns_ipv4 { int sysctl_tcp_recovery; int sysctl_tcp_thin_linear_timeouts; int sysctl_tcp_slow_start_after_idle; + int sysctl_tcp_retrans_collapse; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index cc2ab52..33cc863 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; -- cgit v1.1 From 3f4c7c6f6a9053493ce7dd8a0f17ed8eafc53893 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:01 -0700 Subject: tcp: Namespace-ify sysctl_tcp_stdurg Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b28c172..ffa2cf3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,7 @@ struct netns_ipv4 { int sysctl_tcp_thin_linear_timeouts; int sysctl_tcp_slow_start_after_idle; int sysctl_tcp_retrans_collapse; + int sysctl_tcp_stdurg; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 33cc863..cf3fac7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; -- cgit v1.1 From 625357aa175c688d219da43c8cfaa2e1629e0e1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:02 -0700 Subject: tcp: Namespace-ify sysctl_tcp_rfc1337 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ffa2cf3..968edce 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -134,6 +134,7 @@ struct netns_ipv4 { int sysctl_tcp_slow_start_after_idle; int sysctl_tcp_retrans_collapse; int sysctl_tcp_stdurg; + int sysctl_tcp_rfc1337; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index cf3fac7..2aea2b3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; -- cgit v1.1 From 65c9410cf55ecf32da1b720f563365d565d6289a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:03 -0700 Subject: tcp: Namespace-ify sysctl_tcp_abort_on_overflow Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 968edce..3875fdf 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { int sysctl_tcp_retrans_collapse; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; + int sysctl_tcp_abort_on_overflow; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 2aea2b3..7331281 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; -- cgit v1.1 From 0bc65a28ae2aeb14aab7f4a930e0d8cf4cad9dc4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:04 -0700 Subject: tcp: Namespace-ify sysctl_tcp_fack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3875fdf..f0e792b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -136,6 +136,7 @@ struct netns_ipv4 { int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; int sysctl_tcp_abort_on_overflow; + int sysctl_tcp_fack; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7331281..e7b15e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; -- cgit v1.1 From 773d4bb96ceca6829ae9928f6b002b93e2e62cdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:05 -0700 Subject: tcp: remove stale sysctl_tcp_reordering This extern is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e7b15e9..fc134ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_reordering; extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; -- cgit v1.1 From c6e218035913e14952b04ceecf1a543205106fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:06 -0700 Subject: tcp: Namespace-ify sysctl_tcp_max_reordering Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f0e792b..3f68446 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -137,6 +137,7 @@ struct netns_ipv4 { int sysctl_tcp_rfc1337; int sysctl_tcp_abort_on_overflow; int sysctl_tcp_fack; + int sysctl_tcp_max_reordering; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index fc134ba..8cd2862 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; -- cgit v1.1 From 6496f6bde0c323fba5e8c5b5cbf3a7bf28dad7ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:07 -0700 Subject: tcp: Namespace-ify sysctl_tcp_dsack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3f68446..956957a7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -138,6 +138,7 @@ struct netns_ipv4 { int sysctl_tcp_abort_on_overflow; int sysctl_tcp_fack; int sysctl_tcp_max_reordering; + int sysctl_tcp_dsack; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8cd2862..8b2ae3e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -- cgit v1.1 From 0c12654ac6d9004b9538b2a969b2b59e9a5ed831 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:08 -0700 Subject: tcp: Namespace-ify sysctl_tcp_app_win Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 956957a7..63f91d5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -139,6 +139,7 @@ struct netns_ipv4 { int sysctl_tcp_fack; int sysctl_tcp_max_reordering; int sysctl_tcp_dsack; + int sysctl_tcp_app_win; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b2ae3e..7aa3d65 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; -- cgit v1.1 From 94f0893e0c27219f4a726932618505aab6795973 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:09 -0700 Subject: tcp: Namespace-ify sysctl_tcp_adv_win_scale Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 63f91d5..9dbb07d 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -140,6 +140,7 @@ struct netns_ipv4 { int sysctl_tcp_max_reordering; int sysctl_tcp_dsack; int sysctl_tcp_app_win; + int sysctl_tcp_adv_win_scale; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7aa3d65..0dc27cd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; @@ -1311,9 +1310,9 @@ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); -static inline int tcp_win_from_space(int space) +static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = sysctl_tcp_adv_win_scale; + int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; return tcp_adv_win_scale <= 0 ? (space>>(-tcp_adv_win_scale)) : @@ -1323,13 +1322,13 @@ static inline int tcp_win_from_space(int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk->sk_rcvbuf - + return tcp_win_from_space(sk, sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { - return tcp_win_from_space(sk->sk_rcvbuf); + return tcp_win_from_space(sk, sk->sk_rcvbuf); } extern void tcp_openreq_init_rwin(struct request_sock *req, -- cgit v1.1 From af9b69a7a6ca6b817e8d6f416e7aa5b2a5bf1d91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:10 -0700 Subject: tcp: Namespace-ify sysctl_tcp_frto Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9dbb07d..f4622e2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -141,6 +141,7 @@ struct netns_ipv4 { int sysctl_tcp_dsack; int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; + int sysctl_tcp_frto; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0dc27cd..18f0475 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; -- cgit v1.1 From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Thu, 26 Oct 2017 01:47:42 +0000 Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related syscall") included linux/bpf.h in linux/security.h. As a result, bpf programs including bpf_helpers.h and some other header that ends up pulling in also security.h, such as several examples under samples/bpf, fail to compile because bpf_tail_call and bpf_get_stackid are now "redefined as different kind of symbol". >From bpf.h: u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); Whereas in bpf_helpers.h they are: static void (*bpf_tail_call)(void *ctx, void *map, int index); static int (*bpf_get_stackid)(void *ctx, void *map, int flags); Fix this by removing the unused declaration of bpf_tail_call and moving the declaration of bpf_get_stackid in bpf_trace.c, which is the only place where it's needed. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 172be7f..520aeeb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -231,9 +231,6 @@ struct bpf_event_entry { struct rcu_head rcu; }; -u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); - bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); -- cgit v1.1 From 356c3e9afac0cc19c3d3b0cbc67106ce8efa0743 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 26 Oct 2017 11:00:48 +0200 Subject: net: dsa: lan9303: Move struct lan9303 to include/linux/dsa/lan9303.h The next patch require net/dsa/tag_lan9303.c to access struct lan9303. Therefore move struct lan9303 definitions from drivers/net/dsa/lan9303.h to new file include/linux/dsa/lan9303.h. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/linux/dsa/lan9303.h (limited to 'include') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h new file mode 100644 index 0000000..05d8d13 --- /dev/null +++ b/include/linux/dsa/lan9303.h @@ -0,0 +1,36 @@ +/* Included by drivers/net/dsa/lan9303.h and net/dsa/tag_lan9303.c */ +#include + +struct lan9303; + +struct lan9303_phy_ops { + /* PHY 1 and 2 access*/ + int (*phy_read)(struct lan9303 *chip, int port, int regnum); + int (*phy_write)(struct lan9303 *chip, int port, + int regnum, u16 val); +}; + +#define LAN9303_NUM_ALR_RECORDS 512 +struct lan9303_alr_cache_entry { + u8 mac_addr[ETH_ALEN]; + u8 port_map; /* Bitmap of ports. Zero if unused entry */ + u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ +}; + +struct lan9303 { + struct device *dev; + struct regmap *regmap; + struct regmap_irq_chip_data *irq_data; + struct gpio_desc *reset_gpio; + u32 reset_duration; /* in [ms] */ + bool phy_addr_sel_strap; + struct dsa_switch *ds; + struct mutex indirect_mutex; /* protect indexed register access */ + const struct lan9303_phy_ops *ops; + bool is_bridged; /* true if port 1 and 2 are bridged */ + u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ + /* LAN9303 do not offer reading specific ALR entry. Cache all + * static entries in a flat table + **/ + struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; +}; -- cgit v1.1 From bff7b688d5b10a8cb8cecefdea5e255408b78f2f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:51 -0400 Subject: net: dsa: add dsa_is_unused_port helper As the comment above the chunk states, the b53 driver attempts to disable the unused ports. But using ds->enabled_port_mask is misleading, because this mask reports in fact the user ports. To avoid confusion and fix this, this patch introduces an explicit dsa_is_unused_port helper which ensures the corresponding bit is not masked in any of the switch port masks. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 38961ef..6b1bc1c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -254,6 +254,13 @@ struct dsa_switch { struct dsa_port ports[]; }; +static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +{ + u32 m = ds->enabled_port_mask | ds->dsa_port_mask | ds->cpu_port_mask; + + return !(m & BIT(p)); +} + static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return !!(ds->cpu_port_mask & (1 << p)); -- cgit v1.1 From deb8ee0b51204273c120a3b3848efbb5695ad658 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:53 -0400 Subject: net: dsa: fix dsa_is_normal_port helper In order to know if a port is of type user, dsa_is_normal_port checks that the given port is not of type DSA nor CPU. This is not enough because a port can be unused. Without the previous fix, this caused the unused mv88e6xxx ports to be configured in normal mode. The ds->enabled_port_mask reports the user ports, so check this instead. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6b1bc1c..4ad432a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -273,7 +273,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) { - return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); + return !!(ds->enabled_port_mask & BIT(p)); } static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -- cgit v1.1 From 2b3e9891cb607f7c7d5a4b11fb5a6e775e7f3ef4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:54 -0400 Subject: net: dsa: rename dsa_is_normal_port helper This patch renames dsa_is_normal_port to dsa_is_user_port because "user" is the correct term in the DSA terminology, not "normal". Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4ad432a..49701d95 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -271,7 +271,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) return !!((ds->dsa_port_mask) & (1 << p)); } -static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) +static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return !!(ds->enabled_port_mask & BIT(p)); } -- cgit v1.1 From 02bc6e546e858b209c3ebe380a13a73b333b1b3f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:56 -0400 Subject: net: dsa: introduce dsa_user_ports helper Introduce a dsa_user_ports() helper to return the ds->enabled_port_mask mask which is more explicit. This will also minimize diffs when touching this internal mask. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 49701d95..dc77280 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -276,6 +276,11 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) return !!(ds->enabled_port_mask & BIT(p)); } +static inline u32 dsa_user_ports(struct dsa_switch *ds) +{ + return ds->enabled_port_mask; +} + static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { return &ds->ports[p]; -- cgit v1.1 From 057cad2c59d73b0c4a6638546f3099d6fb444094 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:57 -0400 Subject: net: dsa: define port types Introduce an enumerated type for ports, which will be way more explicit to identify a port type instead of digging into switch port masks. A port can be of type CPU, DSA, user, or unused by default. This is a static parsed information that cannot be changed at runtime. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index dc77280..8da20c4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -180,6 +180,13 @@ struct dsa_port { struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + enum { + DSA_PORT_TYPE_UNUSED = 0, + DSA_PORT_TYPE_CPU, + DSA_PORT_TYPE_DSA, + DSA_PORT_TYPE_USER, + } type; + struct dsa_switch *ds; unsigned int index; const char *name; -- cgit v1.1 From c38c5a66506e4e8223fd03e950b1bde99190701e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:58 -0400 Subject: net: dsa: use new port type in helpers Now that DSA exposes an enumerated type for the ports, we can use them directly instead of checking bitmaps, which is more consistent. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8da20c4..07dfbd7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -261,36 +261,41 @@ struct dsa_switch { struct dsa_port ports[]; }; -static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { - u32 m = ds->enabled_port_mask | ds->dsa_port_mask | ds->cpu_port_mask; + return &ds->ports[p]; +} - return !(m & BIT(p)); +static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +{ + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { - return !!(ds->cpu_port_mask & (1 << p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { - return !!((ds->dsa_port_mask) & (1 << p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { - return !!(ds->enabled_port_mask & BIT(p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } static inline u32 dsa_user_ports(struct dsa_switch *ds) { - return ds->enabled_port_mask; -} + u32 mask = 0; + int p; -static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -{ - return &ds->ports[p]; + for (p = 0; p < ds->num_ports; p++) + if (dsa_is_user_port(ds, p)) + mask |= BIT(p); + + return mask; } static inline u8 dsa_upstream_port(struct dsa_switch *ds) -- cgit v1.1 From 5749f0f3772b9d98f37e3a92539f49fafaa64eca Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:59 -0400 Subject: net: dsa: remove port masks Now that DSA core provides port types, there is no need to keep this information at the switch level. This is a static information that is part of a DSA core dsa_port structure. Remove them. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 07dfbd7..50e276d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -240,9 +240,6 @@ struct dsa_switch { /* * Slave mii_bus and devices for the individual ports. */ - u32 dsa_port_mask; - u32 cpu_port_mask; - u32 enabled_port_mask; u32 phys_mii_mask; struct mii_bus *slave_mii_bus; -- cgit v1.1 From 585d763af09cc21daf48ecc873604ccdb70f6014 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:26 -0700 Subject: net/sched: Introduce Credit Based Shaper (CBS) qdisc This queueing discipline implements the shaper algorithm defined by the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L. It's primary usage is to apply some bandwidth reservation to user defined traffic classes, which are mapped to different queues via the mqprio qdisc. Only a simple software implementation is added for now. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/uapi/linux/pkt_sched.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e7cc3d3..0e88cc2 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -904,4 +904,23 @@ struct tc_pie_xstats { __u32 maxq; /* maximum queue size */ __u32 ecn_mark; /* packets marked with ecn*/ }; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + #endif -- cgit v1.1 From 3d0bd028ffb4a4915cb64cfa0d2cee1578cc0321 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:27 -0700 Subject: net/sched: Add support for HW offloading for CBS This adds support for offloading the CBS algorithm to the controller, if supported. Drivers wanting to support CBS offload must implement the .ndo_setup_tc callback and handle the TC_SETUP_CBS (introduced here) type. Signed-off-by: Vinicius Costa Gomes Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 1 + include/net/pkt_sched.h | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6c7960c8..5e02f79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -776,6 +776,7 @@ enum tc_setup_type { TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, + TC_SETUP_CBS, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index b8ecafc..02f2db2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -140,4 +140,13 @@ static inline struct net *qdisc_net(struct Qdisc *q) return dev_net(q->dev_queue->dev); } +struct tc_cbs_qopt_offload { + u8 enable; + s32 queue; + s32 hicredit; + s32 locredit; + s32 idleslope; + s32 sendslope; +}; + #endif -- cgit v1.1 From ec36e416f06f6a8659281053fdc46ce484ad2211 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:21 -0700 Subject: tcp: Namespace-ify sysctl_tcp_nometrics_save Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f4622e2..9606e2e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -142,6 +142,7 @@ struct netns_ipv4 { int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; + int sysctl_tcp_nometrics_save; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 18f0475..6ab7fa4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -- cgit v1.1 From 4540c0cf98b8892a642d2453eec20ae3eb5696fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:22 -0700 Subject: tcp: Namespace-ify sysctl_tcp_moderate_rcvbuf Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9606e2e..4458a54f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -143,6 +143,7 @@ struct netns_ipv4 { int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; + int sysctl_tcp_moderate_rcvbuf; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ab7fa4..f954e74 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -- cgit v1.1 From d06a99045837d3f4d5431793c4c390b0daf2a08d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:23 -0700 Subject: tcp: Namespace-ify sysctl_tcp_tso_win_divisor Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4458a54f..60bccda 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -144,6 +144,7 @@ struct netns_ipv4 { int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; + int sysctl_tcp_tso_win_divisor; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index f954e74..ed0828d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -- cgit v1.1 From ceef9ab6be7234f9e49f79769e0da88d1dccfcc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:24 -0700 Subject: tcp: Namespace-ify sysctl_tcp_workaround_signed_windows Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 60bccda..e74c7c1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -145,6 +145,7 @@ struct netns_ipv4 { int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; + int sysctl_tcp_workaround_signed_windows; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed0828d..e338e16 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1302,7 +1301,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) } /* Determine a window scaling and initial window to offer. */ -void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, +void tcp_select_initial_window(const struct sock *sk, int __space, + __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); -- cgit v1.1 From 9184d8bb448a3d2c2d9f90f1e2f5de625292e769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:25 -0700 Subject: tcp: Namespace-ify sysctl_tcp_limit_output_bytes Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e74c7c1..e98f473 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -146,6 +146,7 @@ struct netns_ipv4 { int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; + int sysctl_tcp_limit_output_bytes; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index e338e16..33f9d30 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; -- cgit v1.1 From b530b68148301d73775cd27cc136ce4dd5738ae8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:26 -0700 Subject: tcp: Namespace-ify sysctl_tcp_challenge_ack_limit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e98f473..e9895d4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -147,6 +147,7 @@ struct netns_ipv4 { int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_challenge_ack_limit; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 33f9d30..afc2359 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; -- cgit v1.1 From 26e9596e5b8f11025b57b12e7265df649129ab00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:27 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_tso_segs Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e9895d4..a2da3e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { int sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; + int sysctl_tcp_min_tso_segs; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index afc2359..0735303 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; -- cgit v1.1 From bd239704295c66196e6b77c5717ec4aec076ddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:28 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_rtt_wlen Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a2da3e1..1a66af8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_tso_segs; + int sysctl_tcp_min_rtt_wlen; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0735303..56f50c9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; -- cgit v1.1 From 790f00e19f65673c3c169dfc137c09a9236847d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:29 -0700 Subject: tcp: Namespace-ify sysctl_tcp_autocorking Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1a66af8..5378308 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -150,6 +150,7 @@ struct netns_ipv4 { int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_autocorking; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 56f50c9..0268f10 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; -- cgit v1.1 From 4170ba6b589ced82da56c7e4f71cc84b2be036d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:30 -0700 Subject: tcp: Namespace-ify sysctl_tcp_invalid_ratelimit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5378308..e52c212 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -151,6 +151,7 @@ struct netns_ipv4 { int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; int sysctl_tcp_autocorking; + int sysctl_tcp_invalid_ratelimit; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0268f10..5869a82 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; -- cgit v1.1 From 23a7102a2d1068508fa2a0ce593a0df7f8fdc0ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:31 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ss_ratio Also remove an obsolete comment about TCP pacing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e52c212..eb2dcf1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,6 +152,7 @@ struct netns_ipv4 { int sysctl_tcp_min_rtt_wlen; int sysctl_tcp_autocorking; int sysctl_tcp_invalid_ratelimit; + int sysctl_tcp_pacing_ss_ratio; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5869a82..2a5f826 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; -- cgit v1.1 From c26e91f8b9b8e1fd252e07c1f60e50220cd7ebab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:32 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ca_ratio Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb2dcf1..141ba82 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -153,6 +153,7 @@ struct netns_ipv4 { int sysctl_tcp_autocorking; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; + int sysctl_tcp_pacing_ca_ratio; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 2a5f826..092d606 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,8 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_pacing_ca_ratio; - extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; -- cgit v1.1 From 5b52a4c3acf5f4b4854d1c3ddc8be8770330a79c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 27 Oct 2017 10:01:39 -0700 Subject: tcp: remove unnecessary include two extra #include are not necessary in tcp.h Remove them. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tcp.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 092d606..aa1cc90 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -45,9 +45,6 @@ #include #include - -#include -#include #include extern struct inet_hashinfo tcp_hashinfo; -- cgit v1.1 From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 27 Oct 2017 17:30:12 -0700 Subject: ipv6: prevent user from adding cached routes Cached routes should only be created by the system when receiving pmtu discovery or ip redirect msg. Users should not be allowed to create cached routes. Furthermore, after the patch series to move cached routes into exception table, user added cached routes will trigger the following warning in fib6_add(): WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137 fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297 RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814 RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780 RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360 R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521 sock_do_ioctl+0x65/0xb0 net/socket.c:961 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe So we fix this by failing the attemp to add cached routes from userspace with returning EINVAL error. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02..c15d805 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -28,7 +28,7 @@ #define RTF_ROUTEINFO 0x00800000 /* route information - RA */ -#define RTF_CACHE 0x01000000 /* cache entry */ +#define RTF_CACHE 0x01000000 /* read-only: can not be set by user */ #define RTF_FLOW 0x02000000 /* flow significant route */ #define RTF_POLICY 0x04000000 /* policy route */ -- cgit v1.1 From a190d04db93710ae166749055b6985397c6d13f5 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 26 Oct 2017 15:09:21 -0700 Subject: ipvlan: introduce 'private' attribute for all existing modes. IPvlan has always operated in bridge mode. However there are scenarios where each slave should be able to talk through the master device but not necessarily across each other. Think of an environment where each of a namespace is a private and independant customer. In this scenario the machine which is hosting these namespaces neither want to tell who their neighbor is nor the individual namespaces care to talk to neighbor on short-circuited network path. This patch implements the mode that is very similar to the 'private' mode in macvlan where individual slaves can send and receive traffic through the master device, just that they can not talk among slave devices. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b037e0a..052e32c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -465,6 +465,7 @@ enum macsec_validation_type { enum { IFLA_IPVLAN_UNSPEC, IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, __IFLA_IPVLAN_MAX }; @@ -477,6 +478,8 @@ enum ipvlan_mode { IPVLAN_MODE_MAX }; +#define IPVLAN_F_PRIVATE 0x01 + /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, -- cgit v1.1 From fe89aa6b250c1011ccf425fbb7998e96bd54263f Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 26 Oct 2017 15:09:25 -0700 Subject: ipvlan: implement VEPA mode This is very similar to the Macvlan VEPA mode, however, there is some difference. IPvlan uses the mac-address of the lower device, so the VEPA mode has implications of ICMP-redirects for packets destined for its immediate neighbors sharing same master since the packets will have same source and dest mac. The external switch/router will send redirect msg. Having said that, this will be useful tool in terms of debugging since IPvlan will not switch packets within its slaves and rely completely on the external entity as intended in 802.1Qbg. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 052e32c..81f2647 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum ipvlan_mode { }; #define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 /* VXLAN section */ enum { -- cgit v1.1 From 1f01d8be0e6a04bd682a55f6d50c14c1679e7571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20Zapa=C5=82owicz?= Date: Tue, 17 Oct 2017 15:53:49 +0200 Subject: Bluetooth: increase timeout for le auto connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch increases the connection timeout for LE connections that are triggered by the advertising report to 4 seconds. It has been observed that devices equipped with wifi+bt combo SoC fail to create a connection with BLE devices due to their coexistence issues. Increasing this timeout gives them enough time to complete the connection with success. Signed-off-by: Konrad Zapałowicz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fe98f0a..1668211 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -273,7 +273,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ -#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ +#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 -- cgit v1.1 From 2064ee332e4c1b7495cf68b84355c213d8fe71fd Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 30 Oct 2017 10:42:59 +0100 Subject: Bluetooth: Use bt_dev_err and bt_dev_info when possible In case of using BT_ERR and BT_INFO, convert to bt_dev_err and bt_dev_info when possible. This allows for controller specific reporting. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 020142b..e89cff0 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -147,6 +147,9 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) +#define bt_dev_err_ratelimited(hdev, fmt, ...) \ + BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + /* Connection and socket states */ enum { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ -- cgit v1.1 From 384c181e3780ddc45e70483e29d84495b484730d Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 27 Oct 2017 02:35:34 -0700 Subject: net: sched: Identify hardware traffic classes using classid This patch offloads the classid to hardware and uses the classid reserved in the range :ffe0 - :ffef to identify hardware traffic classes reported via dev->num_tc. tcf_result structure contains the class ID of the class to which the packet belongs and is offloaded to hardware via flower filter. A new helper function is introduced to represent HW traffic classes 0 through 15 using the reserved classid values :ffe0 - :ffef. Signed-off-by: Amritha Nambiar Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/net/pkt_cls.h | 1 + include/net/sch_generic.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bf73e16..37c5ef7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -666,6 +666,7 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; + u32 classid; }; enum tc_matchall_command { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 07c179d..c23e938 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -411,6 +411,13 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } +static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) +{ + u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; + + return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; +} + int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); -- cgit v1.1 From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 31 Oct 2017 18:16:05 -0700 Subject: bpf: reduce verifier memory consumption the verifier got progressively smarter over time and size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 It's done by observing that majority of BPF programs use little to no stack whereas verifier kept all of 512 stack slots ready always. Instead dynamically reallocate struct verifier state when stack access is detected. Runtime difference before vs after is within a noise. The number of processed instructions stays the same. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index feeaea9..3b0976a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -88,14 +88,19 @@ enum bpf_stack_slot_type { #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +struct bpf_stack_state { + struct bpf_reg_state spilled_ptr; + u8 slot_type[BPF_REG_SIZE]; +}; + /* state of the program: * type of all registers and stack info */ struct bpf_verifier_state { struct bpf_reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; struct bpf_verifier_state *parent; + int allocated_stack; + struct bpf_stack_state *stack; }; /* linked list of verifier states used to prune search */ @@ -145,7 +150,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ - struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ @@ -159,6 +164,11 @@ struct bpf_verifier_env { struct bpf_verifer_log log; }; +static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +{ + return env->cur_state->regs; +} + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); -- cgit v1.1 From 6c31e5a91fde2e718e59c8a627c56451f88be54c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:13 -0700 Subject: net: Add extack to fib_notifier_info Add extack to fib_notifier_info and plumb through stack to call_fib_rule_notifiers, call_fib_entry_notifiers and call_fib6_entry_notifiers. This allows notifer handlers to return messages to user. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 54cd6b8..c91ec73 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -9,6 +9,7 @@ struct fib_notifier_info { struct net *net; int family; + struct netlink_ext_ack *extack; }; enum fib_event_type { -- cgit v1.1 From da13c59b9936dfedcf9f2203bd29fbf83ad672bf Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 30 Oct 2017 19:38:52 -0400 Subject: net: display hw address of source machine during ipv6 DAD failure This patch updates the error messages displayed in kernel log to include hwaddress of the source machine that caused ipv6 duplicate address detection failures. Examples: a) When we receive a NA packet from another machine advertising our address: ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2001:db8:: on eth0! b) When we detect DAD failure during address assignment to an interface: IPv6: eth0: IPv6 duplicate address 2001:db8:: used by 34:ab:cd:56:11:e8 detected! v2: Changed %pI6 to %pI6c in ndisc_recv_na() Chaged the v6 address in the commit message to 2001:db8:: Suggested-by: Igor Lubashev Signed-off-by: Vishwanath Pai Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 15b5ffd..2a616ea 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -208,7 +208,7 @@ void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); -void addrconf_dad_failure(struct inet6_ifaddr *ifp); +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); -- cgit v1.1 From e9292f2c03851ef81bef38579a0ee9c42140e586 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 31 Oct 2017 15:48:01 +0100 Subject: net: dsa: lan9303: Add STP ALR entry on port 0 STP BPDUs arriving on user ports must sent to CPU port only, for processing by the SW bridge. Add an ALR entry with STP state override to fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index 05d8d13..b2110e6 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -34,3 +34,5 @@ struct lan9303 { **/ struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; }; + +#define eth_stp_addr eth_reserved_addr_base -- cgit v1.1 From 0b5a89caee5c9958c18cd933c7f8891e35b21781 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:38 +0100 Subject: net: sched: remove unused tc_should_offload helper tc_should_offload is no longer used, remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 37c5ef7..108dcdd 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -623,13 +623,6 @@ static inline bool tc_skip_hw(u32 flags) return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } -static inline bool tc_should_offload(const struct net_device *dev, u32 flags) -{ - if (tc_skip_hw(flags)) - return false; - return tc_can_offload(dev); -} - static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; -- cgit v1.1 From 70b5aee46782208c14d93b715e9f62f7fec844f1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:41 +0100 Subject: net: sched: remove ndo_setup_tc check from tc_can_offload Since tc_can_offload is always called from block callback or egdev callback, no need to check if ndo_setup_tc exists. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 108dcdd..d15c40c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -611,11 +611,7 @@ struct tc_cls_u32_offload { static inline bool tc_can_offload(const struct net_device *dev) { - if (!(dev->features & NETIF_F_HW_TC)) - return false; - if (!dev->netdev_ops->ndo_setup_tc) - return false; - return true; + return dev->features & NETIF_F_HW_TC; } static inline bool tc_skip_hw(u32 flags) -- cgit v1.1 From 1495dc9f0a711a54f8fec849ce7f3a8f585a11e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Nov 2017 11:48:00 -0700 Subject: security: bpf: replace include of linux/bpf.h with forward declarations Touching linux/bpf.h makes us rebuild a surprisingly large portion of the kernel. Remove the unnecessary dependency from security.h, it only needs forward declarations. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/security.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 18800b0..73f1ef6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -31,7 +31,6 @@ #include #include #include -#include struct linux_binprm; struct cred; @@ -1732,6 +1731,10 @@ static inline void securityfs_remove(struct dentry *dentry) #endif #ifdef CONFIG_BPF_SYSCALL +union bpf_attr; +struct bpf_map; +struct bpf_prog; +struct bpf_prog_aux; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); -- cgit v1.1 From 47d3d7ac656a1ffb9d0f0d3c845663ed6fd7e78d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Oct 2017 14:16:00 -0700 Subject: ipv6: Implement limits on Hop-by-Hop and Destination options RFC 8200 (IPv6) defines Hop-by-Hop options and Destination options extension headers. Both of these carry a list of TLVs which is only limited by the maximum length of the extension header (2048 bytes). By the spec a host must process all the TLVs in these options, however these could be used as a fairly obvious denial of service attack. I think this could in fact be a significant DOS vector on the Internet, one mitigating factor might be that many FWs drop all packets with EH (and obviously this is only IPv6) so an Internet wide attack might not be so effective (yet!). By my calculation, the worse case packet with TLVs in a standard 1500 byte MTU packet that would be processed by the stack contains 1282 invidual TLVs (including pad TLVS) or 724 two byte TLVs. I wrote a quick test program that floods a whole bunch of these packets to a host and sure enough there is substantial time spent in ip6_parse_tlv. These packets contain nothing but unknown TLVS (that are ignored), TLV padding, and bogus UDP header with zero payload length. 25.38% [kernel] [k] __fib6_clean_all 21.63% [kernel] [k] ip6_parse_tlv 4.21% [kernel] [k] __local_bh_enable_ip 2.18% [kernel] [k] ip6_pol_route.isra.39 1.98% [kernel] [k] fib6_walk_continue 1.88% [kernel] [k] _raw_write_lock_bh 1.65% [kernel] [k] dst_release This patch adds configurable limits to Destination and Hop-by-Hop options. There are three limits that may be set: - Limit the number of options in a Hop-by-Hop or Destination options extension header. - Limit the byte length of a Hop-by-Hop or Destination options extension header. - Disallow unrecognized options in a Hop-by-Hop or Destination options extension header. The limits are set in corresponding sysctls: ipv6.sysctl.max_dst_opts_cnt ipv6.sysctl.max_hbh_opts_cnt ipv6.sysctl.max_dst_opts_len ipv6.sysctl.max_hbh_opts_len If a max_*_opts_cnt is less than zero then unknown TLVs are disallowed. The number of known TLVs that are allowed is the absolute value of this number. If a limit is exceeded when processing an extension header the packet is dropped. Default values are set to 8 for options counts, and set to INT_MAX for maximum length. Note the choice to limit options to 8 is an arbitrary guess (roughly based on the fact that the stack supports three HBH options and just one destination option). These limits have being proposed in draft-ietf-6man-rfc6434-bis. Tested (by Martin Lau) I tested out 1 thread (i.e. one raw_udp process). I changed the net.ipv6.max_dst_(opts|hbh)_number between 8 to 2048. With sysctls setting to 2048, the softirq% is packed to 100%. With 8, the softirq% is almost unnoticable from mpstat. v2; - Code and documention cleanup. - Change references of RFC2460 to be RFC8200. - Add reference to RFC6434-bis where the limits will be in standard. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ipv6.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/net/netns/ipv6.h | 4 ++++ 2 files changed, 44 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3cda3b5..fb6d670 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -51,6 +51,46 @@ #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 +/* Limits on Hop-by-Hop and Destination options. + * + * Per RFC8200 there is no limit on the maximum number or lengths of options in + * Hop-by-Hop or Destination options other then the packet must fit in an MTU. + * We allow configurable limits in order to mitigate potential denial of + * service attacks. + * + * There are three limits that may be set: + * - Limit the number of options in a Hop-by-Hop or Destination options + * extension header + * - Limit the byte length of a Hop-by-Hop or Destination options extension + * header + * - Disallow unknown options + * + * The limits are expressed in corresponding sysctls: + * + * ipv6.sysctl.max_dst_opts_cnt + * ipv6.sysctl.max_hbh_opts_cnt + * ipv6.sysctl.max_dst_opts_len + * ipv6.sysctl.max_hbh_opts_len + * + * max_*_opts_cnt is the number of TLVs that are allowed for Destination + * options or Hop-by-Hop options. If the number is less than zero then unknown + * TLVs are disallowed and the number of known options that are allowed is the + * absolute value. Setting the value to INT_MAX indicates no limit. + * + * max_*_opts_len is the length limit in bytes of a Destination or + * Hop-by-Hop options extension header. Setting the value to INT_MAX + * indicates no length limit. + * + * If a limit is exceeded when processing an extension header the packet is + * silently discarded. + */ + +/* Default limits for Hop-by-Hop and Destination options */ +#define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ +#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ + /* * Addr type * diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2ea1ed3..600ba1c 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -37,6 +37,10 @@ struct netns_sysctl_ipv6 { int idgen_delay; int flowlabel_state_ranges; int flowlabel_reflect; + int max_dst_opts_cnt; + int max_hbh_opts_cnt; + int max_dst_opts_len; + int max_hbh_opts_len; }; struct netns_ipv6 { -- cgit v1.1 From cf34ce3da1e41579296364509266c7dac573822a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 30 Oct 2017 14:41:35 -0700 Subject: tcp: add tracepoint trace_tcp_retransmit_synack() This tracepoint can be used to trace synack retransmits. It maintains pointer to struct request_sock. We cannot simply reuse trace_tcp_retransmit_skb() here, because the sk here is the LISTEN socket. The IP addresses and ports should be extracted from struct request_sock. Note that, like many other tracepoints, this patch uses IS_ENABLED in TP_fast_assign macro, which triggers sparse warning like: ./include/trace/events/tcp.h:274:1: error: directive in argument list ./include/trace/events/tcp.h:281:1: error: directive in argument list However, there is no good solution to avoid these warnings. To the best of our knowledge, these warnings are harmless. Signed-off-by: Song Liu Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 03699ba..07cccca 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -237,6 +237,62 @@ TRACE_EVENT(tcp_set_state, show_tcp_state_name(__entry->newstate)) ); +TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), + + TP_ARGS(sk, req), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(const void *, req) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_request_sock *ireq = inet_rsk(req); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->req = req; + + __entry->sport = ireq->ir_num; + __entry->dport = ntohs(ireq->ir_rmt_port); + + p32 = (__be32 *) __entry->saddr; + *p32 = ireq->ir_loc_addr; + + p32 = (__be32 *) __entry->daddr; + *p32 = ireq->ir_rmt_addr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = ireq->ir_v6_loc_addr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = ireq->ir_v6_rmt_addr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.1 From 054287295b1132c8742ea55f8e3af9cbd630c932 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Thu, 2 Nov 2017 10:36:48 +0100 Subject: net: Define eth_stp_addr in linux/etherdevice.h The lan9303 driver defines eth_stp_addr as a synonym to eth_reserved_addr_base to get the STP ethernet address 01:80:c2:00:00:00. eth_reserved_addr_base is also used to define the start of Bridge Reserved ethernet address range, which happen to be the STP address. br_dev_setup refer to eth_reserved_addr_base as a definition of STP address. Clean up by: - Move the eth_stp_addr definition to linux/etherdevice.h - Use eth_stp_addr instead of eth_reserved_addr_base in br_dev_setup. Signed-off-by: Egil Hjelmeland Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 2 -- include/linux/etherdevice.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index b2110e6..05d8d13 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -34,5 +34,3 @@ struct lan9303 { **/ struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS]; }; - -#define eth_stp_addr eth_reserved_addr_base diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2d9f808..263dbca 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -66,6 +66,7 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; +#define eth_stp_addr eth_reserved_addr_base /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local -- cgit v1.1 From 3ae6ec08292f01c6782d1a80be0b2cc675e0ecfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 2 Nov 2017 17:14:05 +0100 Subject: ipv4: Send a netevent whenever multipath hash policy is changed Devices performing IPv4 forwarding need to update their multipath hash policy whenever it is changed. Inform these devices by generating a netevent. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netevent.h b/include/net/netevent.h index f440df1..e3f0e8f 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -25,6 +25,7 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ + NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); -- cgit v1.1 From c7eb7d7230509ec862d4144f7a831f995bc5d028 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:24 +0100 Subject: net: sched: introduce chain_head_change callback Add a callback that is to be called whenever head of the chain changes. Also provide a callback for the default case when the caller gets a block using non-extended getter. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 14 ++++++-------- include/net/sch_generic.h | 5 ++++- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d15c40c..505d4b7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -26,6 +26,8 @@ enum tcf_block_binder_type { struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; }; struct tcf_block_cb; @@ -37,12 +39,10 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) @@ -82,8 +82,7 @@ int tcf_block_get(struct tcf_block **p_block, } static inline -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei) { return 0; @@ -94,8 +93,7 @@ static inline void tcf_block_put(struct tcf_block *block) } static inline -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c23e938..f230269 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -260,9 +260,12 @@ struct qdisc_skb_cb { unsigned char data[QDISC_CB_PRIV_LEN]; }; +typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); + struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct tcf_proto __rcu **p_filter_chain; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ -- cgit v1.1 From 46209401f8f6116bd0b2c2d14a63958e83ffca0b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:25 +0100 Subject: net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath In sch_handle_egress and sch_handle_ingress tp->q is used only in order to update stats. So stats and filter list are the only things that are needed in clsact qdisc fastpath processing. Introduce new mini_Qdisc struct to hold those items. Also, introduce a helper to swap the mini_Qdisc structures in case filter list head changes. This removes need for tp->q usage without added overhead. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 ++++++--- include/net/sch_generic.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5e02f79..7de7656 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1559,6 +1559,8 @@ enum netdev_priv_flags { * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one + * @miniq_ingress: ingress/clsact qdisc specific data for + * ingress processing * @ingress_queue: XXX: need comments on this one * @broadcast: hw bcast address * @@ -1576,7 +1578,8 @@ enum netdev_priv_flags { * @tx_global_lock: XXX: need comments on this one * * @xps_maps: XXX: need comments on this one - * + * @miniq_egress: clsact qdisc specific data for + * egress processing * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -1795,7 +1798,7 @@ struct net_device { void __rcu *rx_handler_data; #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto __rcu *ingress_cl_list; + struct mini_Qdisc __rcu *miniq_ingress; #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS @@ -1826,7 +1829,7 @@ struct net_device { struct xps_dev_maps __rcu *xps_maps; #endif #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto __rcu *egress_cl_list; + struct mini_Qdisc __rcu *miniq_egress; #endif /* These may be needed for future network-power-down code. */ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f230269..c64e62c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -904,4 +904,36 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +/* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. + * The fast path only needs to access filter list and to update stats + */ +struct mini_Qdisc { + struct tcf_proto *filter_list; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct rcu_head rcu; +}; + +static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, + const struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); +} + +static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) +{ + this_cpu_inc(miniq->cpu_qstats->drops); +} + +struct mini_Qdisc_pair { + struct mini_Qdisc miniq1; + struct mini_Qdisc miniq2; + struct mini_Qdisc __rcu **p_miniq; +}; + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head); +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq); + #endif -- cgit v1.1 From 27c565ae9d554fa1c00c799754cff43476c8d3b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Nov 2017 08:53:27 -0700 Subject: ipv6: remove IN6_ADDR_HSIZE from addrconf.h IN6_ADDR_HSIZE is private to addrconf.c, move it here to avoid confusion. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3357332..b623b65 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -59,9 +59,6 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 8 -#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) - int addrconf_init(void); void addrconf_cleanup(void); -- cgit v1.1 From ee20598194500e82c477cf13e52b58e569446ed0 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 15:42:15 -0500 Subject: net/dcb: Add dscp to priority selector type IEEE specification P802.1Qcd/D2.1 defines priority selector 5. This APP TLV selector defines DSCP to priority map. This patch defines such DSCP selector. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/uapi/linux/dcbnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index b6170a6..2c0c645 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -206,6 +206,7 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_STREAM 2 #define IEEE_8021QAZ_APP_SEL_DGRAM 3 #define IEEE_8021QAZ_APP_SEL_ANY 4 +#define IEEE_8021QAZ_APP_SEL_DSCP 5 /* This structure contains the IEEE 802.1Qaz APP managed object. This * object is also used for the CEE std as well. -- cgit v1.1 From c02762eb20cb57ec5b7c037b056c37d5838c803f Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 16:03:17 -0500 Subject: net/mlx5: QCAM register firmware command support The QCAM register provides capability bit for all the QoS registers using ACCESS_REG command. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 14 ++++++++++++++ include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 40 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index e32dbc4..6d79b3f 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1000,6 +1000,14 @@ enum mlx5_mcam_feature_groups { MLX5_MCAM_FEATURE_ENHANCED_FEATURES = 0x0, }; +enum mlx5_qcam_reg_groups { + MLX5_QCAM_REGS_FIRST_128 = 0x0, +}; + +enum mlx5_qcam_feature_groups { + MLX5_QCAM_FEATURE_ENHANCED_FEATURES = 0x0, +}; + /* GET Dev Caps macros */ #define MLX5_CAP_GEN(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) @@ -1108,6 +1116,12 @@ enum mlx5_mcam_feature_groups { #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \ MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) +#define MLX5_CAP_QCAM_REG(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_access_reg_cap_mask.reg_cap.fld) + +#define MLX5_CAP_QCAM_FEATURE(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_feature_cap_mask.feature_cap.fld) + #define MLX5_CAP_FPGA(mdev, cap) \ MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 08c77b7..ed5be52 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -109,6 +109,7 @@ enum { enum { MLX5_REG_QETCR = 0x4005, MLX5_REG_QTCT = 0x400a, + MLX5_REG_QCAM = 0x4019, MLX5_REG_DCBX_PARAM = 0x4020, MLX5_REG_DCBX_APP = 0x4021, MLX5_REG_FPGA_CAP = 0x4022, @@ -798,6 +799,7 @@ struct mlx5_core_dev { u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; + u32 qcam[MLX5_ST_SZ_DW(qcam_reg)]; } caps; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6977234..f127c5b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -838,7 +838,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cc_modify_allowed[0x1]; u8 start_pad[0x1]; u8 cache_line_128byte[0x1]; - u8 reserved_at_165[0xb]; + u8 reserved_at_165[0xa]; + u8 qcam_reg[0x1]; u8 gid_table_size[0x10]; u8 out_of_seq_cnt[0x1]; @@ -7890,6 +7891,43 @@ struct mlx5_ifc_mcam_reg_bits { u8 reserved_at_1c0[0x80]; }; +struct mlx5_ifc_qcam_access_reg_cap_mask { + u8 qcam_access_reg_cap_mask_127_to_20[0x6C]; + u8 qpdpm[0x1]; + u8 qcam_access_reg_cap_mask_18_to_4[0x0F]; + u8 qdpm[0x1]; + u8 qpts[0x1]; + u8 qcap[0x1]; + u8 qcam_access_reg_cap_mask_0[0x1]; +}; + +struct mlx5_ifc_qcam_qos_feature_cap_mask { + u8 qcam_qos_feature_cap_mask_127_to_1[0x7F]; + u8 qpts_trust_both[0x1]; +}; + +struct mlx5_ifc_qcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + u8 reserved_at_20[0x20]; + + union { + struct mlx5_ifc_qcam_access_reg_cap_mask reg_cap; + u8 reserved_at_0[0x80]; + } qos_access_reg_cap_mask; + + u8 reserved_at_c0[0x80]; + + union { + struct mlx5_ifc_qcam_qos_feature_cap_mask feature_cap; + u8 reserved_at_0[0x80]; + } qos_feature_cap_mask; + + u8 reserved_at_1c0[0x80]; +}; + struct mlx5_ifc_pcap_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.1 From 71c70eb21c33c60433b95e72a59d40bb128db649 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 2 Aug 2017 21:36:23 -0500 Subject: net/mlx5: Add MLX5_SET16 and MLX5_GET16 Add MLX5_SET16 and MLX5_GET16 for 16bit structure field in firmware command. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 6d79b3f..409ffb1 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -49,11 +49,15 @@ #define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) #define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) #define __mlx5_bit_off(typ, fld) (offsetof(struct mlx5_ifc_##typ##_bits, fld)) +#define __mlx5_16_off(typ, fld) (__mlx5_bit_off(typ, fld) / 16) #define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) #define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_16_bit_off(typ, fld) (16 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0xf)) #define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) #define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) #define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_mask16(typ, fld) ((u16)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_16_mask(typ, fld) (__mlx5_mask16(typ, fld) << __mlx5_16_bit_off(typ, fld)) #define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) @@ -116,6 +120,19 @@ __mlx5_mask(typ, fld)) ___t; \ }) +#define MLX5_GET16(typ, p, fld) ((be16_to_cpu(*((__be16 *)(p) +\ +__mlx5_16_off(typ, fld))) >> __mlx5_16_bit_off(typ, fld)) & \ +__mlx5_mask16(typ, fld)) + +#define MLX5_SET16(typ, p, fld, v) do { \ + u16 _v = v; \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 16); \ + *((__be16 *)(p) + __mlx5_16_off(typ, fld)) = \ + cpu_to_be16((be16_to_cpu(*((__be16 *)(p) + __mlx5_16_off(typ, fld))) & \ + (~__mlx5_16_mask(typ, fld))) | (((_v) & __mlx5_mask16(typ, fld)) \ + << __mlx5_16_bit_off(typ, fld))); \ +} while (0) + /* Big endian getters */ #define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ __mlx5_64_off(typ, fld))) -- cgit v1.1 From 415a64aa8dc6b4fc478609c549ca652d95a12f13 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 16:08:46 -0500 Subject: net/mlx5: QPTS and QPDPM register firmware command support The QPTS register allows changing the priority trust state between pcp and dscp. Add support to get/set trust state from device. When the port is in pcp/dscp trust state, packet is routed by hardware to matching priority based on its pcp/dscp value respectively. The QPDPM register allow channing the dscp to priority mapping. Add support to get/set dscp to priority mapping from device. Note that to change a dscp mapping, the "e" bit of this dscp structure must be set in the QPDPM firmware command. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 7 +++++++ include/linux/mlx5/mlx5_ifc.h | 20 ++++++++++++++++++++ include/linux/mlx5/port.h | 5 +++++ 3 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ed5be52..a886b51 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -107,8 +107,10 @@ enum { }; enum { + MLX5_REG_QPTS = 0x4002, MLX5_REG_QETCR = 0x4005, MLX5_REG_QTCT = 0x400a, + MLX5_REG_QPDPM = 0x4013, MLX5_REG_QCAM = 0x4019, MLX5_REG_DCBX_PARAM = 0x4020, MLX5_REG_DCBX_APP = 0x4021, @@ -142,6 +144,11 @@ enum { MLX5_REG_MCAM = 0x907f, }; +enum mlx5_qpts_trust_state { + MLX5_QPTS_TRUST_PCP = 1, + MLX5_QPTS_TRUST_DSCP = 2, +}; + enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_HOST = 0x0, MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f127c5b..3e5363f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8578,6 +8578,26 @@ struct mlx5_ifc_qetc_reg_bits { struct mlx5_ifc_ets_global_config_reg_bits global_configuration; }; +struct mlx5_ifc_qpdpm_dscp_reg_bits { + u8 e[0x1]; + u8 reserved_at_01[0x0b]; + u8 prio[0x04]; +}; + +struct mlx5_ifc_qpdpm_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + struct mlx5_ifc_qpdpm_dscp_reg_bits dscp[64]; +}; + +struct mlx5_ifc_qpts_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x2d]; + u8 trust_state[0x3]; +}; + struct mlx5_ifc_qtct_reg_bits { u8 reserved_at_0[0x8]; u8 port_number[0x8]; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index c59af8a..035f0d4 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -179,4 +179,9 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); #endif /* __MLX5_PORT_H__ */ -- cgit v1.1 From 9354d452034273a50a4fd703bea31e5d6b1fc20b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:37 -0200 Subject: openvswitch: reliable interface indentification in port dumps This patch allows reliable identification of netdevice interfaces connected to openvswitch bridges. In particular, user space queries the netdev interfaces belonging to the ports for statistics, up/down state, etc. Datapath dump needs to provide enough information for the user space to be able to do that. Currently, only interface names are returned. This is not sufficient, as openvswitch allows its ports to be in different name spaces and the interface name is valid only in its name space. What is needed and generally used in other netlink APIs, is the pair ifindex+netnsid. The solution is addition of the ifindex+netnsid pair (or only ifindex if in the same name space) to vport get/dump operation. On request side, ideally the ifindex+netnsid pair could be used to get/set/del the corresponding vport. This is not implemented by this patch and can be added later if needed. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ffe397d..501e4c4 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -258,6 +258,8 @@ enum ovs_vport_attr { /* receiving upcalls */ OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ OVS_VPORT_ATTR_PAD, + OVS_VPORT_ATTR_IFINDEX, + OVS_VPORT_ATTR_NETNSID, __OVS_VPORT_ATTR_MAX }; -- cgit v1.1 From 79e1ad148c844f5c8b9d76b36b26e3886dca95ae Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:38 -0200 Subject: rtnetlink: use netnsid to query interface Currently, when an application gets netnsid from the kernel (for example as the result of RTM_GETLINK call on one end of the veth pair), it's not much useful. There's no reliable way to get to the netns fd from the netnsid, nor does any kernel API accept netnsid. Extend the RTM_GETLINK call to also accept netnsid. It will operate on the netns with the given netnsid in such case. Of course, the calling process needs to have enough capabilities in the target name space; for now, require CAP_NET_ADMIN. This can be relaxed in the future. To signal to the calling process that the kernel understood the new IFLA_IF_NETNSID attribute in the query, it will include it in the response. This is needed to detect older kernels, as they will just ignore IFLA_IF_NETNSID and query in the current name space. This patch implemetns IFLA_IF_NETNSID only for get and dump. For set operations, this can be extended later. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b3cf563..19fc026 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -160,6 +160,7 @@ enum { IFLA_XDP, IFLA_EVENT, IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, __IFLA_MAX }; -- cgit v1.1 From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:16 -0700 Subject: net: bpf: rename ndo_xdp to ndo_bpf ndo_xdp is a control path callback for setting up XDP in the driver. We can reuse it for other forms of communication between the eBPF stack and the drivers. Rename the callback and associated structures and definitions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7de7656..9af9fea 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -779,10 +779,10 @@ enum tc_setup_type { TC_SETUP_CBS, }; -/* These structures hold the attributes of xdp state that are being passed - * to the netdevice through the xdp op. +/* These structures hold the attributes of bpf state that are being passed + * to the netdevice through the bpf op. */ -enum xdp_netdev_command { +enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are @@ -801,8 +801,8 @@ enum xdp_netdev_command { struct netlink_ext_ack; -struct netdev_xdp { - enum xdp_netdev_command command; +struct netdev_bpf { + enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { @@ -1124,9 +1124,10 @@ struct dev_ifalias { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. - * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the - * netdevice. See definition of enum xdp_netdev_command for details. + * netdevice and manage BPF offload. See definition of + * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. @@ -1315,8 +1316,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); - int (*ndo_xdp)(struct net_device *dev, - struct netdev_xdp *xdp); + int (*ndo_bpf)(struct net_device *dev, + struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); void (*ndo_xdp_flush)(struct net_device *dev); @@ -3311,10 +3312,10 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); -- cgit v1.1 From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in current eBPF infrastructure. We have to reverse or limit the changes which kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifer log in device JIT). It allows device JITs to perform translation on the original bytecode. __bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 10 ++++++++++ include/linux/netdevice.h | 14 ++++++++++++++ include/uapi/linux/bpf.h | 1 + 4 files changed, 61 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 520aeeb..e45d43f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -15,6 +15,7 @@ #include #include #include +#include struct perf_event; struct bpf_prog; @@ -182,6 +183,16 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_dev_offload { + struct bpf_prog *prog; + struct net_device *netdev; + void *dev_priv; + struct list_head offloads; + bool dev_state; + bool verifier_running; + wait_queue_head_t verifier_done; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -199,6 +210,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_dev_offload *offload; union { struct work_struct work; struct rcu_head rcu; @@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; @@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, } #endif /* CONFIG_BPF_SYSCALL */ +int bpf_prog_offload_compile(struct bpf_prog *prog); +void bpf_prog_offload_destroy(struct bpf_prog *prog); + +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return aux->offload; +} +#else +static inline int bpf_prog_offload_init(struct bpf_prog *prog, + union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return false; +} +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ + #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3b0976a..e45011d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -153,6 +153,7 @@ struct bpf_verifier_env { struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ @@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return env->cur_state->regs; } +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); +#else +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + return -EOPNOTSUPP; +} +#endif + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9af9fea..fda527c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -797,8 +797,13 @@ enum bpf_netdev_command { * is equivalent to XDP_ATTACHED_DRV. */ XDP_QUERY_PROG, + /* BPF program for offload callbacks, invoked at program load time. */ + BPF_OFFLOAD_VERIFIER_PREP, + BPF_OFFLOAD_TRANSLATE, + BPF_OFFLOAD_DESTROY, }; +struct bpf_ext_analyzer_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -815,6 +820,15 @@ struct netdev_bpf { u8 prog_attached; u32 prog_id; }; + /* BPF_OFFLOAD_VERIFIER_PREP */ + struct { + struct bpf_prog *prog; + const struct bpf_ext_analyzer_ops *ops; /* callee set */ + } verifier; + /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ + struct { + struct bpf_prog *prog; + } offload; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a982067..80d191a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.1 From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e45d43f..98bacd0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 80d191a..4455dd1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -895,6 +895,10 @@ enum sk_action { #define BPF_TAG_SIZE 8 +enum bpf_prog_status { + BPF_PROG_STATUS_DEV_BOUND = (1 << 0), +}; + struct bpf_prog_info { __u32 type; __u32 id; @@ -908,6 +912,8 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.1 From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98bacd0..c397934 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -335,6 +335,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -428,6 +430,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, + enum bpf_prog_type type, + struct net_device *netdev) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i) { -- cgit v1.1 From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e45011d..07b96aa 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -152,9 +152,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ - void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -179,7 +177,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) } #endif -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv); - #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.1 From 99feaafcdb566e8f032e7acc2a303713ad6bf196 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:20 -0400 Subject: net: dsa: make switch index unsigned Define the DSA switch index as an unsigned int, because it will never be less than 0. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 50e276d..fa1c21ab 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -209,7 +209,7 @@ struct dsa_switch { * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; - int index; + unsigned int index; /* Listener for switch fabric events */ struct notifier_block nb; -- cgit v1.1 From 49463b7f2da1a115404b02c5533bc2c2125833a3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:21 -0400 Subject: net: dsa: make tree index unsigned Similarly to a DSA switch and port, rename the tree index from "tree" to "index" and make it an unsigned int because it isn't supposed to be less than 0. u32 is an OF specific data used to retrieve the value and has no need to be propagated up to the tree index. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index fa1c21ab..e543329 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,7 +116,7 @@ struct dsa_switch_tree { struct raw_notifier_head nh; /* Tree identifier */ - u32 tree; + unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; -- cgit v1.1 From 1f2556916d974cfb62b6af51660186b5f58bd869 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Fri, 3 Nov 2017 16:38:48 -0700 Subject: tcp: higher throughput under reordering with adaptive RACK reordering wnd Currently TCP RACK loss detection does not work well if packets are being reordered beyond its static reordering window (min_rtt/4).Under such reordering it may falsely trigger loss recoveries and reduce TCP throughput significantly. This patch improves that by increasing and reducing the reordering window based on DSACK, which is now supported in major TCP implementations. It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries. - If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded by srtt), since there is possibility that spurious retransmission was due to reordering delay longer than reo_wnd. - Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) no. of successful recoveries (accounts for full DSACK-based loss recovery undo). After that, reset it to default (min_rtt/4). - At max, reo_wnd is incremented only once per rtt. So that the new DSACK on which we are reacting, is due to the spurious retx (approx) after the reo_wnd has been updated last time. - reo_wnd is tracked in terms of steps (of min_rtt/4), rather than absolute value to account for change in rtt. In our internal testing, we observed significant increase in throughput, in scenarios where reordering exceeds min_rtt/4 (previous static value). Signed-off-by: Priyaranjan Jha Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 +++++++-- include/net/tcp.h | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8c43138..22f40c9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,8 +210,13 @@ struct tcp_sock { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ - u8 advanced; /* mstamp advanced since last lost marking */ - u8 reord; /* reordering detected */ + u32 last_delivered; /* tp->delivered at last reo_wnd adj */ + u8 reo_wnd_steps; /* Allowed reordering window */ +#define TCP_RACK_RECOVERY_THRESH 16 + u8 reo_wnd_persist:5, /* No. of recovery since last adj */ + dsack_seen:1, /* Whether DSACK seen after last adj */ + advanced:1, /* mstamp advanced since last lost marking */ + reord:1; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ diff --git a/include/net/tcp.h b/include/net/tcp.h index c2bf2a8..babfd4d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ +#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) -- cgit v1.1 From ecf8fecb7828648cba0e42de7464a7e600c93459 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:31 -0500 Subject: device_cgroup: prepare code for bpf-based device controller This is non-functional change to prepare the device cgroup code for adding eBPF-based controller for cgroups v2. The patch performs the following changes: 1) __devcgroup_inode_permission() and devcgroup_inode_mknod() are moving to the device-cgroup.h and converting into static inline. 2) __devcgroup_check_permission() is exported. 3) devcgroup_check_permission() wrapper is introduced to be used by both existing and new bpf-based implementations. Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/device_cgroup.h | 61 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index cdbc344..2d93d7e 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,17 +1,70 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#define DEVCG_ACC_MKNOD 1 +#define DEVCG_ACC_READ 2 +#define DEVCG_ACC_WRITE 4 +#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) + +#define DEVCG_DEV_BLOCK 1 +#define DEVCG_DEV_CHAR 2 +#define DEVCG_DEV_ALL 4 /* this represents all devices */ + +#ifdef CONFIG_CGROUP_DEVICE +extern int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access); +#else +static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, + short access) +{ return 0; } +#endif + #ifdef CONFIG_CGROUP_DEVICE -extern int __devcgroup_inode_permission(struct inode *inode, int mask); -extern int devcgroup_inode_mknod(int mode, dev_t dev); +static inline int devcgroup_check_permission(short type, u32 major, u32 minor, + short access) +{ + return __devcgroup_check_permission(type, major, minor, access); +} + static inline int devcgroup_inode_permission(struct inode *inode, int mask) { + short type, access = 0; + if (likely(!inode->i_rdev)) return 0; - if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) + + if (S_ISBLK(inode->i_mode)) + type = DEVCG_DEV_BLOCK; + else if (S_ISCHR(inode->i_mode)) + type = DEVCG_DEV_CHAR; + else return 0; - return __devcgroup_inode_permission(inode, mask); + + if (mask & MAY_WRITE) + access |= DEVCG_ACC_WRITE; + if (mask & MAY_READ) + access |= DEVCG_ACC_READ; + + return devcgroup_check_permission(type, imajor(inode), iminor(inode), + access); } + +static inline int devcgroup_inode_mknod(int mode, dev_t dev) +{ + short type; + + if (!S_ISBLK(mode) && !S_ISCHR(mode)) + return 0; + + if (S_ISBLK(mode)) + type = DEVCG_DEV_BLOCK; + else + type = DEVCG_DEV_CHAR; + + return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), + DEVCG_ACC_MKNOD); +} + #else static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } -- cgit v1.1 From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type, which in combination of previously added ability to attach multiple eBPF programs to a cgroup, will provide a similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer which defines if the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 15 +++++++++++++++ include/linux/bpf_types.h | 3 +++ include/linux/device_cgroup.h | 8 +++++++- include/uapi/linux/bpf.h | 15 +++++++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 87a7db9..a7f16e0 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, } \ __ret; \ }) + +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + access, \ + BPF_CGROUP_DEVICE); \ + \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 53c5b9a..978c1d9 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 2d93d7e..8557efe 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 @@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, { return 0; } #endif -#ifdef CONFIG_CGROUP_DEVICE +#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { + int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); + + if (rc) + return -EPERM; + return __devcgroup_check_permission(type, major, minor, access); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4455dd1..e880ae6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,6 +142,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -991,4 +993,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.1 From 5caaed151a68ae36aca2981cc245f5960a0a7603 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 19:41:09 +0100 Subject: netfilter: conntrack: don't cache nlattr_tuple_size result in nla_size We currently call ->nlattr_tuple_size() once at register time and cache result in l4proto->nla_size. nla_size is the only member that is written to, avoiding this would allow to make l4proto trackers const. We can use ->nlattr_tuple_size() at run time, and cache result in the individual trackers instead. This is an intermediate step, next patch removes nlattr_size() callback and computes size at compile time, then removes nla_size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index e065188..46e786f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -74,7 +74,7 @@ struct nf_conntrack_l4proto { int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ - int (*nlattr_tuple_size)(void); + unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; @@ -144,7 +144,7 @@ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t); -int nf_ct_port_nlattr_tuple_size(void); +unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -- cgit v1.1 From ba0e4d9917b43dfa746cbbcb4477da59aae73bd6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 9 Oct 2017 19:52:28 +0200 Subject: netfilter: nf_tables: get set elements via netlink This patch adds a new get operation to look up for specific elements in a set via netlink interface. You can also use it to check if an interval already exists. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f5b12a..d011e56 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -311,6 +311,7 @@ struct nft_expr; * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elemeennts + * @get: get set elements * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance @@ -350,6 +351,10 @@ struct nft_set_ops { void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); + void * (*get)(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem, + unsigned int flags); unsigned int (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); -- cgit v1.1 From 3928ee6485a316c8abde7e24c7f82033a1c8d3ae Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 2 Nov 2017 00:49:18 +0100 Subject: net: phy: leds: Add support for "link" trigger Currently, we create a LED trigger for any link speed known to a PHY. These triggers only fire when their exact link speed had been negotiated (they aren't cumulative, that is, they don't fire for "their or any higher" link speed). What we are missing, however, is a trigger which will fire on any link speed known to the PHY. Such trigger can then be used for implementing a poor man's substitute of the "link" LED on boards that lack it. Let's add it. Signed-off-by: Maciej S. Szmigiero Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d78cd01..dc82a07 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -451,6 +451,8 @@ struct phy_device { struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; + + struct phy_led_trigger *led_link_trigger; #endif /* -- cgit v1.1 From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:23 -0800 Subject: ila: add checksum neutral map auto Add checksum neutral auto that performs checksum neutral mapping without using the C-bit. This is enabled by configuration of a mapping. The checksum neutral function has been split into ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former handles the C-bit and includes it in the adjustment value. The latter just sets the adjustment value on the locator diff only. Added configuration for checksum neutral map aut in ila_lwt and ila_xlat. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index f548532..0744881 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -41,6 +41,7 @@ enum { ILA_CSUM_ADJUST_TRANSPORT, ILA_CSUM_NEUTRAL_MAP, ILA_CSUM_NO_ACTION, + ILA_CSUM_NEUTRAL_MAP_AUTO, }; #endif /* _UAPI_LINUX_ILA_H */ -- cgit v1.1 From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:24 -0800 Subject: ila: allow configuration of identifier type Allow identifier to be explicitly configured for a mapping. This can either be one of the identifier types specified in the ILA draft or a value of ILA_ATYPE_USE_FORMAT which means the identifier type is inferred from the identifier type field. If a value other than ILA_ATYPE_USE_FORMAT is set for a mapping then it is assumed that the identifier type field is not present in an identifier. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 0744881..8353c78 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -17,6 +17,7 @@ enum { ILA_ATTR_DIR, /* u32 */ ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ + ILA_ATTR_IDENT_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -44,4 +45,16 @@ enum { ILA_CSUM_NEUTRAL_MAP_AUTO, }; +enum { + ILA_ATYPE_IID = 0, + ILA_ATYPE_LUID, + ILA_ATYPE_VIRT_V4, + ILA_ATYPE_VIRT_UNI_V6, + ILA_ATYPE_VIRT_MULTI_V6, + ILA_ATYPE_NONLOCAL_ADDR, + ILA_ATYPE_RSVD_1, + ILA_ATYPE_RSVD_2, + + ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ +}; #endif /* _UAPI_LINUX_ILA_H */ -- cgit v1.1 From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:25 -0800 Subject: ila: Add a hook type for LWT routes In LWT tunnels both an input and output route method is defined. If both of these are executed in the same path then double translation happens and the effect is not correct. This patch adds a new attribute that indicates the hook type. Two values are defined for route output and route output. ILA translation is only done for the one that is set. The default is to enable ILA on route output. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 8353c78..483b77af 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -18,6 +18,7 @@ enum { ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ ILA_ATTR_IDENT_TYPE, /* u8 */ + ILA_ATTR_HOOK_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -57,4 +58,10 @@ enum { ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ }; + +enum { + ILA_HOOK_ROUTE_OUTPUT, + ILA_HOOK_ROUTE_INPUT, +}; + #endif /* _UAPI_LINUX_ILA_H */ -- cgit v1.1 From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:41 +0100 Subject: net_sch: red: Add offload ability to RED qdisc Add the ability to offload RED qdisc by using ndo_setup_tc. There are four commands for RED offloading: * TC_RED_SET: handles set and change. * TC_RED_DESTROY: handle qdisc destroy. * TC_RED_STATS: update the qdiscs counters (given as reference) * TC_RED_XSTAT: returns red xstats. Whether RED is being offloaded is being determined every time dump action is being called because parent change of this qdisc could change its offload state but doesn't require any RED function to be called. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/pkt_sched.h | 1 + 3 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fda527c..71968a2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -777,6 +777,7 @@ enum tc_setup_type { TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_CBS, + TC_SETUP_QDISC_RED, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 98fef32..03c208d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -703,4 +703,34 @@ struct tc_cookie { u8 *data; u32 len; }; + +enum tc_red_command { + TC_RED_REPLACE, + TC_RED_DESTROY, + TC_RED_STATS, + TC_RED_XSTATS, +}; + +struct tc_red_qopt_offload_params { + u32 min; + u32 max; + u32 probability; + bool is_ecn; +}; +struct tc_red_qopt_offload_stats { + struct gnet_stats_basic_packed *bstats; + struct gnet_stats_queue *qstats; +}; + +struct tc_red_qopt_offload { + enum tc_red_command command; + u32 handle; + u32 parent; + union { + struct tc_red_qopt_offload_params set; + struct tc_red_qopt_offload_stats stats; + struct red_stats *xstats; + }; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5002562..6a2c5ea 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,6 +256,7 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 +#define TC_RED_OFFLOADED 8 }; struct tc_red_xstats { -- cgit v1.1 From 575ed7d39e2fbe602a3894bc766a8cb49af83bd3 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:42 +0100 Subject: net_sch: mqprio: Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO to match the new convention. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 71968a2..703885a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -770,7 +770,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); enum tc_setup_type { - TC_SETUP_MQPRIO, + TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, -- cgit v1.1 From 8521db4c7e155d12fb280686c0552e47f77e9110 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:43 +0100 Subject: net_sch: cbs: Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS to match the new convention.. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 703885a..30f0f29 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -776,7 +776,7 @@ enum tc_setup_type { TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, - TC_SETUP_CBS, + TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, }; -- cgit v1.1 From 620a5c860b774a81ce3f193eefb52bf4d128cca5 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Mon, 6 Nov 2017 12:42:01 +0100 Subject: net: dsa: lan9303: Correct register names in comments Two comments refer to registers, but lack the LAN9303_ prefix. Fix that. Signed-off-by: Egil Hjelmeland Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/dsa/lan9303.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h index 05d8d13..f48a85c 100644 --- a/include/linux/dsa/lan9303.h +++ b/include/linux/dsa/lan9303.h @@ -13,8 +13,8 @@ struct lan9303_phy_ops { #define LAN9303_NUM_ALR_RECORDS 512 struct lan9303_alr_cache_entry { u8 mac_addr[ETH_ALEN]; - u8 port_map; /* Bitmap of ports. Zero if unused entry */ - u8 stp_override; /* non zero if set ALR_DAT1_AGE_OVERRID */ + u8 port_map; /* Bitmap of ports. Zero if unused entry */ + u8 stp_override; /* non zero if set LAN9303_ALR_DAT1_AGE_OVERRID */ }; struct lan9303 { @@ -28,7 +28,9 @@ struct lan9303 { struct mutex indirect_mutex; /* protect indexed register access */ const struct lan9303_phy_ops *ops; bool is_bridged; /* true if port 1 and 2 are bridged */ - u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ + + /* remember LAN9303_SWE_PORT_STATE while not bridged */ + u32 swe_port_state; /* LAN9303 do not offer reading specific ALR entry. Cache all * static entries in a flat table **/ -- cgit v1.1 From 96c623e51f1c40bf524decc48c6fac7ce5dd41f7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Nov 2017 14:26:10 +0100 Subject: of: add of_property_read_variable_* dummy helpers Commit a67e9472da42 ("of: Add array read functions with min/max size limits") added a new interface for reading variable-length arrays from DT properties. One user was added in dsa recently and this causes a build error because that code can be built with CONFIG_OF disabled: net/dsa/dsa2.c: In function 'dsa_switch_parse_member_of': net/dsa/dsa2.c:678:7: error: implicit declaration of function 'of_property_read_variable_u32_array'; did you mean 'of_property_read_u32_array'? [-Werror=implicit-function-declaration] This adds a dummy functions for of_property_read_variable_u32_array() and a few others that had been missing here. I decided to move of_property_read_string() and of_property_read_string_helper() in the process to make it easier to compare the two sets of function prototypes to make sure they match. Fixes: 975e6e32215e ("net: dsa: rework switch parsing") Signed-off-by: Arnd Bergmann Acked-by: Rob Herring Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/of.h | 62 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index b240ed6..b32d418 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -675,12 +675,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_u32_index(const struct device_node *np, - const char *propname, u32 index, u32 *out_value) -{ - return -ENOSYS; -} - static inline int of_property_read_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz) { @@ -707,16 +701,14 @@ static inline int of_property_read_u64_array(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_string(const struct device_node *np, - const char *propname, - const char **out_string) +static inline int of_property_read_u32_index(const struct device_node *np, + const char *propname, u32 index, u32 *out_value) { return -ENOSYS; } -static inline int of_property_read_string_helper(const struct device_node *np, - const char *propname, - const char **out_strs, size_t sz, int index) +static inline int of_property_read_u64_index(const struct device_node *np, + const char *propname, u32 index, u64 *out_value) { return -ENOSYS; } @@ -744,12 +736,51 @@ static inline int of_n_size_cells(struct device_node *np) return 0; } +static inline int of_property_read_variable_u8_array(const struct device_node *np, + const char *propname, u8 *out_values, + size_t sz_min, size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_variable_u16_array(const struct device_node *np, + const char *propname, u16 *out_values, + size_t sz_min, size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_variable_u32_array(const struct device_node *np, + const char *propname, + u32 *out_values, + size_t sz_min, + size_t sz_max) +{ + return -ENOSYS; +} + static inline int of_property_read_u64(const struct device_node *np, const char *propname, u64 *out_value) { return -ENOSYS; } +static inline int of_property_read_variable_u64_array(const struct device_node *np, + const char *propname, + u64 *out_values, + size_t sz_min, + size_t sz_max) +{ + return -ENOSYS; +} + +static inline int of_property_read_string(const struct device_node *np, + const char *propname, + const char **out_string) +{ + return -ENOSYS; +} + static inline int of_property_match_string(const struct device_node *np, const char *propname, const char *string) @@ -757,6 +788,13 @@ static inline int of_property_match_string(const struct device_node *np, return -ENOSYS; } +static inline int of_property_read_string_helper(const struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index) +{ + return -ENOSYS; +} + static inline struct device_node *of_parse_phandle(const struct device_node *np, const char *phandle_name, int index) -- cgit v1.1 From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Tue, 7 Nov 2017 21:07:02 +0800 Subject: openvswitch: enable NSH support v16->17 - Fixed disputed check code: keep them in nsh_push and nsh_pop but also add them in __ovs_nla_copy_actions v15->v16 - Add csum recalculation for nsh_push, nsh_pop and set_nsh pointed out by Pravin - Move nsh key into the union with ipv4 and ipv6 and add check for nsh key in match_validate pointed out by Pravin - Add nsh check in validate_set and __ovs_nla_copy_actions v14->v15 - Check size in nsh_hdr_from_nlattr - Fixed four small issues pointed out By Jiri and Eric v13->v14 - Rename skb_push_nsh to nsh_push per Dave's comment - Rename skb_pop_nsh to nsh_pop per Dave's comment v12->v13 - Fix NSH header length check in set_nsh v11->v12 - Fix missing changes old comments pointed out - Fix new comments for v11 v10->v11 - Fix the left three disputable comments for v9 but not fixed in v10. v9->v10 - Change struct ovs_key_nsh to struct ovs_nsh_key_base base; __be32 context[NSH_MD1_CONTEXT_SIZE]; - Fix new comments for v9 v8->v9 - Fix build error reported by daily intel build because nsh module isn't selected by openvswitch v7->v8 - Rework nested value and mask for OVS_KEY_ATTR_NSH - Change pop_nsh to adapt to nsh kernel module - Fix many issues per comments from Jiri Benc v6->v7 - Remove NSH GSO patches in v6 because Jiri Benc reworked it as another patch series and they have been merged. - Change it to adapt to nsh kernel module added by NSH GSO patch series v5->v6 - Fix the rest comments for v4. - Add NSH GSO support for VxLAN-gpe + NSH and Eth + NSH. v4->v5 - Fix many comments by Jiri Benc and Eric Garver for v4. v3->v4 - Add new NSH match field ttl - Update NSH header to the latest format which will be final format and won't change per its author's confirmation. - Fix comments for v3. v2->v3 - Change OVS_KEY_ATTR_NSH to nested key to handle length-fixed attributes and length-variable attriubte more flexibly. - Remove struct ovs_action_push_nsh completely - Add code to handle nested attribute for SET_MASKED - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH to transfer NSH header data. - Fix comments and coding style issues by Jiri and Eric v1->v2 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh - Dynamically allocate struct ovs_action_push_nsh for length-variable metadata. OVS master and 2.8 branch has merged NSH userspace patch series, this patch is to enable NSH support in kernel data path in order that OVS can support NSH in compat mode by porting this. Signed-off-by: Yi Yang Acked-by: Jiri Benc Acked-by: Eric Garver Acked-by: Pravin Shelar Signed-off-by: David S. Miller --- include/net/nsh.h | 3 +++ include/uapi/linux/openvswitch.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/nsh.h b/include/net/nsh.h index a1eaea2..350b1ad 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -304,4 +304,7 @@ static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags, NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK); } +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh); +int nsh_pop(struct sk_buff *skb); + #endif /* __NET_NSH_H */ diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 501e4c4..ec75a68 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -336,6 +336,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_LABELS, /* 16-octet connection tracking label */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ + OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -495,6 +496,30 @@ struct ovs_key_ct_tuple_ipv6 { __u8 ipv6_proto; }; +enum ovs_nsh_key_attr { + OVS_NSH_KEY_ATTR_UNSPEC, + OVS_NSH_KEY_ATTR_BASE, /* struct ovs_nsh_key_base. */ + OVS_NSH_KEY_ATTR_MD1, /* struct ovs_nsh_key_md1. */ + OVS_NSH_KEY_ATTR_MD2, /* variable-length octets for MD type 2. */ + __OVS_NSH_KEY_ATTR_MAX +}; + +#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1) + +struct ovs_nsh_key_base { + __u8 flags; + __u8 ttl; + __u8 mdtype; + __u8 np; + __be32 path_hdr; +}; + +#define NSH_MD1_CONTEXT_SIZE 4 + +struct ovs_nsh_key_md1 { + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow @@ -811,6 +836,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. + * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet. + * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -841,6 +868,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ + OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ + OVS_ACTION_ATTR_POP_NSH, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit v1.1 From 24a9332a58b7f41a0d36c35a2c6897242bffdbc0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:43 -0500 Subject: net: dsa: constify cpu_dp member of dsa_port A DSA port has a dedicated CPU port assigned to it, stored in the cpu_dp member. It is not meant to be modified by a port, thus make it const. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e543329..2a8613b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -190,7 +190,7 @@ struct dsa_port { struct dsa_switch *ds; unsigned int index; const char *name; - struct dsa_port *cpu_dp; + const struct dsa_port *cpu_dp; struct device_node *dn; unsigned int ageing_time; u8 stp_state; -- cgit v1.1 From ec15dd4269d0cbf947c9a2dfdcf08a917098fab1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:46 -0500 Subject: net: dsa: setup and teardown tree This commit provides better scope for the DSA tree setup and teardown functions. It renames the "applied" bool to "setup" and print a message when the tree is setup, as it is done during teardown. At the same time, check dst->setup in dsa_tree_setup, where it is set to true. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2a8613b..6c23925 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -122,7 +122,7 @@ struct dsa_switch_tree { struct kref refcount; /* Has this tree been applied to the hardware? */ - bool applied; + bool setup; /* * Configuration data for the platform device that owns -- cgit v1.1 From 375ef2b1f0d0b43b0d36ffdd521637ff59b0c13c Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 17 Sep 2017 13:43:58 +0300 Subject: net: Introduce netdev_*_once functions Extend the net device error logging with netdev_*_once macros. netdev_*_once are the equivalents of the dev_*_once macros which are useful for messages that should only be logged once. Also add netdev_WARN_ONCE, which is the "once" extension for the already existing netdev_WARN macro. Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 30f0f29..79518ed 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4336,6 +4336,31 @@ void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) void netdev_info(const struct net_device *dev, const char *format, ...); +#define netdev_level_once(level, dev, fmt, ...) \ +do { \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define netdev_emerg_once(dev, fmt, ...) \ + netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) +#define netdev_alert_once(dev, fmt, ...) \ + netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) +#define netdev_crit_once(dev, fmt, ...) \ + netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) +#define netdev_err_once(dev, fmt, ...) \ + netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) +#define netdev_warn_once(dev, fmt, ...) \ + netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) +#define netdev_notice_once(dev, fmt, ...) \ + netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) +#define netdev_info_once(dev, fmt, ...) \ + netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) + #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) @@ -4376,6 +4401,10 @@ do { \ WARN(1, "netdevice: %s%s\n" format, netdev_name(dev), \ netdev_reg_state(dev), ##args) +#define netdev_WARN_ONCE(dev, condition, format, arg...) \ + WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev) \ + netdev_reg_state(dev), ##args) + /* netif printk helpers, similar to netdev_printk */ #define netif_printk(priv, type, level, dev, fmt, args...) \ -- cgit v1.1 From 4382c7b92a1db397874ca62c73aa8b023af6dba8 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 10 Sep 2017 13:22:51 +0300 Subject: net/mlx5e: Add 802.1ad VLAN insertion support Report VLAN insertion support for S-tagged packets and add support by choosing the correct VLAN type in the WQE. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 66d19b6..62af751 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -221,6 +221,7 @@ enum { }; enum { + MLX5_ETH_WQE_SVLAN = 1 << 0, MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; -- cgit v1.1 From 4d63adfe12dd9cb61ed8badb4d798955399048c2 Mon Sep 17 00:00:00 2001 From: Mark Greer Date: Thu, 15 Jun 2017 20:34:22 -0700 Subject: NFC: Add NFC_CMD_DEACTIVATE_TARGET support Once an NFC target (i.e., a tag) is found, it remains active until there is a failure reading or writing it (often caused by the target moving out of range). While the target is active, the NFC adapter and antenna must remain powered. This wastes power when the target remains in range but the client application no longer cares whether it is there or not. To mitigate this, add a new netlink command that allows userspace to deactivate an active target. When issued, this command will cause the NFC subsystem to act as though the target was moved out of range. Once the command has been executed, the client application can power off the NFC adapter to reduce power consumption. Signed-off-by: Mark Greer Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 399f39f..f6e3c8c 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -89,6 +89,7 @@ * @NFC_CMD_ACTIVATE_TARGET: Request NFC controller to reactivate target. * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly * from the driver in order to support hardware specific operations. + * @NFC_CMD_DEACTIVATE_TARGET: Request NFC controller to deactivate target. */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -121,6 +122,7 @@ enum nfc_commands { NFC_CMD_SE_IO, NFC_CMD_ACTIVATE_TARGET, NFC_CMD_VENDOR, + NFC_CMD_DEACTIVATE_TARGET, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; -- cgit v1.1 From 47d5b6db2afa766d7af85db684d0b5f092e4fc46 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:10:59 +0100 Subject: net: bridge: Add/del switchdev object on host join/leave When the host joins or leaves a multicast group, use switchdev to add an object to the hardware to forward traffic for the group to the host. Signed-off-by: Andrew Lunn Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d756fbe..39bc855 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -76,6 +76,7 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_UNDEFINED, SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, + SWITCHDEV_OBJ_ID_HOST_MDB, }; struct switchdev_obj { -- cgit v1.1 From a3dcaf17ee54f1d01d22cc2b22cab0b4f60d78cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:27 -0800 Subject: net: allow per netns sysctl_rmem and sysctl_wmem for protos As we want to gradually implement per netns sysctl_rmem and sysctl_wmem on per protocol basis, add two new fields in struct proto, and two new helpers : sk_get_wmem0() and sk_get_rmem0() First user will be TCP. Then UDP and SCTP can be easily converted, while DECNET probably wont get this support. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 22 ++++++++++++++++++++++ include/trace/events/sock.h | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 6f1be97..688a823 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1101,8 +1101,12 @@ struct proto { */ unsigned long *memory_pressure; long *sysctl_mem; + int *sysctl_wmem; int *sysctl_rmem; + u32 sysctl_wmem_offset; + u32 sysctl_rmem_offset; + int max_header; bool no_autobind; @@ -2390,4 +2394,22 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) +{ + /* Does this proto have per netns sysctl_wmem ? */ + if (proto->sysctl_wmem_offset) + return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + + return *proto->sysctl_wmem; +} + +static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) +{ + /* Does this proto have per netns sysctl_rmem ? */ + if (proto->sysctl_rmem_offset) + return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + + return *proto->sysctl_rmem; +} + #endif /* _SOCK_H */ diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 6d31c05..ec4dade 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -48,7 +48,7 @@ TRACE_EVENT(sock_exceed_buf_limit, strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem = prot->sysctl_mem; __entry->allocated = allocated; - __entry->sysctl_rmem = prot->sysctl_rmem[0]; + __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); ), -- cgit v1.1 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from initial netns. This change is needed so that we can refine TCP rcvbuf autotuning, to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 379550f..5e12975 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -155,6 +155,8 @@ struct netns_ipv4 { int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; + int sysctl_tcp_wmem[3]; + int sysctl_tcp_rmem[3]; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index babfd4d..2f2c69a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; -extern int sysctl_tcp_wmem[3]; -extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ -- cgit v1.1 From 4c5b9d9642c859f7369338fc42c0f62f4151bef3 Mon Sep 17 00:00:00 2001 From: Manish Kurup Date: Tue, 7 Nov 2017 15:49:05 -0500 Subject: act_vlan: VLAN action rewrite to use RCU lock/unlock and update Using a spinlock in the VLAN action causes performance issues when the VLAN action is used on multiple cores. Rewrote the VLAN action to use RCU read locking for reads and updates instead. All functions now use an RCU dereferenced pointer to access the VLAN action context. Modified helper functions used by other modules, to use the RCU as opposed to directly accessing the structure. Acked-by: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: Manish Kurup Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 46 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index c2090df..22ae260 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -13,12 +13,17 @@ #include #include +struct tcf_vlan_params { + int tcfv_action; + u16 tcfv_push_vid; + __be16 tcfv_push_proto; + u8 tcfv_push_prio; + struct rcu_head rcu; +}; + struct tcf_vlan { struct tc_action common; - int tcfv_action; - u16 tcfv_push_vid; - __be16 tcfv_push_proto; - u8 tcfv_push_prio; + struct tcf_vlan_params __rcu *vlan_p; }; #define to_vlan(a) ((struct tcf_vlan *)a) @@ -33,22 +38,45 @@ static inline bool is_tcf_vlan(const struct tc_action *a) static inline u32 tcf_vlan_action(const struct tc_action *a) { - return to_vlan(a)->tcfv_action; + u32 tcfv_action; + + rcu_read_lock(); + tcfv_action = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_action; + rcu_read_unlock(); + + return tcfv_action; } static inline u16 tcf_vlan_push_vid(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_vid; + u16 tcfv_push_vid; + + rcu_read_lock(); + tcfv_push_vid = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_vid; + rcu_read_unlock(); + + return tcfv_push_vid; } static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_proto; + __be16 tcfv_push_proto; + + rcu_read_lock(); + tcfv_push_proto = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_proto; + rcu_read_unlock(); + + return tcfv_push_proto; } static inline u8 tcf_vlan_push_prio(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_prio; -} + u8 tcfv_push_prio; + rcu_read_lock(); + tcfv_push_prio = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_prio; + rcu_read_unlock(); + + return tcfv_push_prio; +} #endif /* __NET_TC_VLAN_H */ -- cgit v1.1 From 54985120a1c461b74f9510e5d730971f2a2383b1 Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Tue, 7 Nov 2017 11:32:11 -0800 Subject: net: fix incorrect comment with regard to VLAN packet handling The commit bcc6d4790361 ("net: vlan: make non-hw-accel rx path similar to hw-accel") unified accel and non-accel path for VLAN RX. With that fix we do not register any packet_type handler for VLANs anymore, so fix the incorrect comment. Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 79518ed..6b274bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4479,15 +4479,7 @@ do { \ * Why 16. Because with 16 the only overlap we get on a hash of the * low nibble of the protocol value is RARP/SNAP/X.25. * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. - * --BLG - * * 0800 IP - * 8100 802.1Q VLAN * 0001 802.3 * 0002 AX.25 * 0004 802.2 -- cgit v1.1 From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 7 ++++++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0cd02ff4..eaec066 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -459,7 +459,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + kprobe_override:1; /* Do we override a kprobe? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 84014ec..17e5e82 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -523,6 +523,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e880ae6..adb66f7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.1 From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 7 Nov 2017 21:52:09 -0800 Subject: net: ipv6: sysctl to specify IPv6 ND traffic class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti Signed-off-by: Erik Kline Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ea04ca0..cb18c62 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -73,6 +73,7 @@ struct ipv6_devconf { __u32 enhanced_dad; __u32 addr_gen_mode; __s32 disable_policy; + __s32 ndisc_tclass; struct ctl_table_header *sysctl_header; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index b22a9c4..9c0f4a9 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -186,6 +186,7 @@ enum { DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, + DEVCONF_NDISC_TCLASS, DEVCONF_MAX }; -- cgit v1.1 From 8d6e79d3ce13e34957de87f7584cbf1bcde74c57 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 8 Nov 2017 09:59:26 +0100 Subject: tipc: improve link resiliency when rps is activated Currently, the TIPC RPS dissector is based only on the incoming packets' source node address, hence steering all traffic from a node to the same core. We have seen that this makes the links vulnerable to starvation and unnecessary resets when we turn down the link tolerance to very low values. To reduce the risk of this happening, we exempt probe and probe replies packets from the convergence to one core per source node. Instead, we do the opposite, - we try to diverge those packets across as many cores as possible, by randomizing the flow selector key. To make such packets identifiable to the dissector, we add a new 'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is set both for PROBE and PROBE_REPLY messages, and only for those. It should be noted that these packets are not part of any flow anyway, and only constitute a minuscule fraction of all packets sent across a link. Hence, there is no risk that this will affect overall performance. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 12 ++++----- include/net/tipc.h | 62 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 include/net/tipc.h (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 22aba32..9a07477 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -84,11 +84,11 @@ struct flow_dissector_key_ipv6_addrs { }; /** - * struct flow_dissector_key_tipc_addrs: - * @srcnode: source node address + * struct flow_dissector_key_tipc: + * @key: source node address combined with selector */ -struct flow_dissector_key_tipc_addrs { - __be32 srcnode; +struct flow_dissector_key_tipc { + __be32 key; }; /** @@ -100,7 +100,7 @@ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; - struct flow_dissector_key_tipc_addrs tipcaddrs; + struct flow_dissector_key_tipc tipckey; }; }; @@ -192,7 +192,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ - FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ + FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ diff --git a/include/net/tipc.h b/include/net/tipc.h new file mode 100644 index 0000000..07670ec --- /dev/null +++ b/include/net/tipc.h @@ -0,0 +1,62 @@ +/* + * include/net/tipc.h: Include file for TIPC message header routines + * + * Copyright (c) 2017 Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_HDR_H +#define _TIPC_HDR_H + +#include + +#define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ + +struct tipc_basic_hdr { + __be32 w[4]; +}; + +static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) +{ + u32 w0 = ntohl(hdr->w[0]); + bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; + int key; + + /* Return source node identity as key */ + if (likely(!keepalive_msg)) + return hdr->w[3]; + + /* Spread PROBE/PROBE_REPLY messages across the cores */ + get_random_bytes(&key, sizeof(key)); + return key; +} + +#endif -- cgit v1.1 From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +-- include/linux/trace_events.h | 1 - include/uapi/linux/bpf.h | 7 +------ 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index eaec066..0cd02ff4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -459,8 +459,7 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - kprobe_override:1; /* Do we override a kprobe? */ + dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 17e5e82..84014ec 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -523,7 +523,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index adb66f7..e880ae6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,10 +677,6 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -740,8 +736,7 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.1 From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:26 -0800 Subject: tcp: retire FACK loss detection FACK loss detection has been disabled by default and the successor RACK subsumed FACK and can handle reordering better. This patch removes FACK to simplify TCP loss recovery. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 14 +------------- include/uapi/linux/snmp.h | 1 - 3 files changed, 1 insertion(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 22f40c9..9574936 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -85,7 +85,6 @@ struct tcp_sack_block { /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ -#define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f2c69a..ed71511 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -384,7 +384,6 @@ void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); -void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); @@ -776,7 +775,7 @@ struct tcp_skb_cb { }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ - __u8 sacked; /* State flags for SACK/FACK. */ + __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ @@ -1066,7 +1065,6 @@ void tcp_rate_check_app_limited(struct sock *sk); * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK - * tcp_is_fack - FACK enabled, implies SACK enabled */ static inline int tcp_is_sack(const struct tcp_sock *tp) { @@ -1078,16 +1076,6 @@ static inline bool tcp_is_reno(const struct tcp_sock *tp) return !tcp_is_sack(tp); } -static inline bool tcp_is_fack(const struct tcp_sock *tp) -{ - return tp->rx_opt.sack_ok & TCP_FACK_ENABLED; -} - -static inline void tcp_enable_fack(struct tcp_sock *tp) -{ - tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 0d941cd..33a70ec 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -191,7 +191,6 @@ enum LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ - LINUX_MIB_TCPFACKREORDER, /* TCPFACKReorder */ LINUX_MIB_TCPSACKREORDER, /* TCPSACKReorder */ LINUX_MIB_TCPRENOREORDER, /* TCPRenoReorder */ LINUX_MIB_TCPTSREORDER, /* TCPTSReorder */ -- cgit v1.1 From 737ff314563ca27f044f9a3a041e9d42491ef7ce Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:27 -0800 Subject: tcp: use sequence distance to detect reordering Replace the reordering distance measurement in packet unit with sequence based approach. Previously it trackes the number of "packets" toward the forward ACK (i.e. highest sacked sequence)in a state variable "fackets_out". Precisely measuring reordering degree on packet distance has not much benefit, as the degree constantly changes by factors like path, load, and congestion window. It is also complicated and prone to arcane bugs. This patch replaces with sequence-based approach that's much simpler. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9574936..df5d97a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -293,7 +293,6 @@ struct tcp_sock { u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ - u32 fackets_out; /* FACK'd packets */ struct hrtimer pacing_timer; -- cgit v1.1 From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 10 Nov 2017 14:03:51 -0800 Subject: net: Remove unused skb_shared_info member ip6_frag_id was only used by UFO, which has been removed. ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no in-tree callers. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - include/net/ipv6.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 57d7126..54fe911 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,6 @@ struct skb_shared_info { struct skb_shared_hwtstamps hwtstamps; unsigned int gso_type; u32 tskey; - __be32 ip6_frag_id; /* * Warning : all fields before dataref are cleared in __alloc_skb() diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fb6d670..ec14f0d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,7 +767,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); -- cgit v1.1 From 99803171ef04037092bf5eb29ae801e8b4d49a75 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:27 -0800 Subject: netem: add uapi to express delay and jitter in nanoseconds netem userspace has long relied on a horrible /proc/net/psched hack to translate the current notion of "ticks" to nanoseconds. Expressing latency and jitter instead, in well defined nanoseconds, increases the dynamic range of emulated delays and jitter in netem. It will also ease a transition where reducing a tick to nsec equivalence would constrain the max delay in prior versions of netem to only 4.3 seconds. Signed-off-by: Dave Taht Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 6a2c5ea..8fe6d18 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -537,6 +537,8 @@ enum { TCA_NETEM_ECN, TCA_NETEM_RATE64, TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, __TCA_NETEM_MAX, }; -- cgit v1.1 From 836af83b54e3e285c4a0cc06c24aeb737d3e0e18 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:28 -0800 Subject: netem: support delivering packets in delayed time slots Slotting is a crude approximation of the behaviors of shared media such as cable, wifi, and LTE, which gather up a bunch of packets within a varying delay window and deliver them, relative to that, nearly all at once. It works within the existing loss, duplication, jitter and delay parameters of netem. Some amount of inherent latency must be specified, regardless. The new "slot" parameter specifies a minimum and maximum delay between transmission attempts. The "bytes" and "packets" parameters can be used to limit the amount of information transferred per slot. Examples of use: tc qdisc add dev eth0 root netem delay 200us \ slot 800us 10ms bytes 64k packets 42 A more correct example, using stacked netem instances and a packet limit to emulate a tail drop wifi queue with slots and variable packet delivery, with a 200Mbit isochronous underlying rate, and 20ms path delay: tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \ limit 10000 tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \ slot 800us 10ms bytes 64k packets 42 limit 512 Signed-off-by: Dave Taht Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8fe6d18..af3cc2f 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -539,6 +539,7 @@ enum { TCA_NETEM_PAD, TCA_NETEM_LATENCY64, TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, __TCA_NETEM_MAX, }; @@ -576,6 +577,13 @@ struct tc_netem_rate { __s32 cell_overhead; }; +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; +}; + enum { NETEM_LOSS_UNSPEC, NETEM_LOSS_GI, /* General Intuitive - 4 state model */ -- cgit v1.1 From 5ed4e3eb021762fee584ce65620bc822131c7aa0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:52 -0800 Subject: net: dsa: Pass a port to get_tag_protocol() A number of drivers want to check whether the configured CPU port is a possible configuration for enabling tagging, pass down the CPU port number so they verify that. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6c23925..68e232f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -321,7 +321,8 @@ struct dsa_switch_ops { struct device *host_dev, int sw_addr, void **priv); - enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); + enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, + int port); int (*setup)(struct dsa_switch *ds); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); -- cgit v1.1 From b74b70c44986dee87881fbed3d912e02c5dcf78c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:54 -0800 Subject: net: dsa: Support prepended Broadcom tag Add a new type: DSA_TAG_PROTO_PREPEND which allows us to support for the 4-bytes Broadcom tag that we already support, but in a format where it is pre-pended to the packet instead of located between the MAC SA and the Ethertyper (DSA_TAG_PROTO_BRCM). Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 68e232f..2a05738 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -29,6 +29,7 @@ struct fixed_phy_status; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, DSA_TAG_PROTO_BRCM, + DSA_TAG_PROTO_BRCM_PREPEND, DSA_TAG_PROTO_DSA, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_KSZ, -- cgit v1.1 From 5794040647de4011598a6d005fdad95d24fd385b Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:40 -0800 Subject: openvswitch: Add meter netlink definitions Meter has its own netlink family. Define netlink messages and attributes for communicating with the user space programs. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ec75a68..d60b9a4 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -883,4 +883,55 @@ enum ovs_action_attr { #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1) +/* Meters. */ +#define OVS_METER_FAMILY "ovs_meter" +#define OVS_METER_MCGROUP "ovs_meter" +#define OVS_METER_VERSION 0x1 + +enum ovs_meter_cmd { + OVS_METER_CMD_UNSPEC, + OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */ + OVS_METER_CMD_SET, /* Add or modify a meter. */ + OVS_METER_CMD_DEL, /* Delete a meter. */ + OVS_METER_CMD_GET /* Get meter stats. */ +}; + +enum ovs_meter_attr { + OVS_METER_ATTR_UNSPEC, + OVS_METER_ATTR_ID, /* u32 meter ID within datapath. */ + OVS_METER_ATTR_KBPS, /* No argument. If set, units in kilobits + * per second. Otherwise, units in + * packets per second. + */ + OVS_METER_ATTR_STATS, /* struct ovs_flow_stats for the meter. */ + OVS_METER_ATTR_BANDS, /* Nested attributes for meter bands. */ + OVS_METER_ATTR_USED, /* u64 msecs last used in monotonic time. */ + OVS_METER_ATTR_CLEAR, /* Flag to clear stats, used. */ + OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */ + OVS_METER_ATTR_MAX_BANDS, /* u32 max number of bands per meter. */ + OVS_METER_ATTR_PAD, + __OVS_METER_ATTR_MAX +}; + +#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1) + +enum ovs_band_attr { + OVS_BAND_ATTR_UNSPEC, + OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */ + OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */ + OVS_BAND_ATTR_BURST, /* u32 burst size in meter units. */ + OVS_BAND_ATTR_STATS, /* struct ovs_flow_stats for the band. */ + __OVS_BAND_ATTR_MAX +}; + +#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1) + +enum ovs_meter_band_type { + OVS_METER_BAND_TYPE_UNSPEC, + OVS_METER_BAND_TYPE_DROP, /* Drop exceeding packets. */ + __OVS_METER_BAND_TYPE_MAX +}; + +#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1) + #endif /* _LINUX_OPENVSWITCH_H */ -- cgit v1.1 From cd8a6c33693c1b89d2737ffdbf9611564e9ac907 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:43 -0800 Subject: openvswitch: Add meter action support Implements OVS kernel meter action support. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index d60b9a4..4265d7f 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -838,6 +838,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet. * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. + * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the + * packet, or modify the packet (e.g., change the DSCP field). * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -870,6 +872,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ OVS_ACTION_ATTR_POP_NSH, /* No argument. */ + OVS_ACTION_ATTR_METER, /* u32 meter ID. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit v1.1 From 3a9b76fd0db9f0d426533f96a68a62a58753a51e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Nov 2017 15:54:12 -0800 Subject: tcp: allow drivers to tweak TSQ logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I had many reports that TSQ logic breaks wifi aggregation. Current logic is to allow up to 1 ms of bytes to be queued into qdisc and drivers queues. But Wifi aggregation needs a bigger budget to allow bigger rates to be discovered by various TCP Congestion Controls algorithms. This patch adds an extra socket field, allowing wifi drivers to select another log scale to derive TCP Small Queue credit from current pacing rate. Initial value is 10, meaning that this patch does not change current behavior. We expect wifi drivers to set this field to smaller values (tests have been done with values from 6 to 9) They would have to use following template : if (skb->sk && skb->sk->sk_pacing_shift != MY_PACING_SHIFT) skb->sk->sk_pacing_shift = MY_PACING_SHIFT; Ref: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1670041 Signed-off-by: Eric Dumazet Cc: Johannes Berg Cc: Toke Høiland-Jørgensen Cc: Kir Kolyshkin Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 688a823..f8715c5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -267,6 +267,7 @@ struct sock_common { * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments + * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct @@ -451,6 +452,7 @@ struct sock { kmemcheck_bitfield_end(flags); u16 sk_gso_max_segs; + u8 sk_pacing_shift; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; -- cgit v1.1 From 6d88207fcfddc002afe3e2e4a455e5201089d5d9 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:45 +0200 Subject: tls: Add function to update the TLS socket configuration The tx configuration is now stored in ctx->tx_conf. And sk->sk_prot is updated trough a function This will simplify things when we add rx and support for different possible tx and rx cross configurations. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index b89d397..f058a6e 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,8 @@ struct tls_context { void *priv_ctx; + u8 tx_conf:2; + u16 prepend_size; u16 tag_size; u16 overhead_size; -- cgit v1.1 From ff45d820a2df163957ad8ab459b6eb6976144c18 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:46 +0200 Subject: tls: Fix TLS ulp context leak, when TLS_TX setsockopt is not used. Previously the TLS ulp context would leak if we attached a TLS ulp to a socket but did not use the TLS_TX setsockopt, or did use it but it failed. This patch solves the issue by overriding prot[TLS_BASE_TX].close and fixing tls_sk_proto_close to work properly when its called with ctx->tx_conf == TLS_BASE_TX. This patch also removes ctx->free_resources as we can use ctx->tx_conf to obtain the relevant information. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index f058a6e..7cb58a6 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,7 +99,6 @@ struct tls_context { u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); - void (*free_resources)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); @@ -124,6 +123,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); +void tls_sw_free_tx_resources(struct sock *sk); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); -- cgit v1.1 From 213ef6e7c9c063c482d77f12cc438872628d48ec Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:47 +0200 Subject: tls: Move tls_make_aad to header to allow sharing move tls_make_aad as it is going to be reused by the device offload code and rx path. Remove unused recv parameter. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 7cb58a6..70becd0 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -214,6 +214,21 @@ static inline void tls_fill_prepend(struct tls_context *ctx, ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } +static inline void tls_make_aad(char *buf, + size_t size, + char *record_sequence, + int record_sequence_size, + unsigned char record_type) +{ + memcpy(buf, record_sequence, record_sequence_size); + + buf[8] = record_type; + buf[9] = TLS_1_2_VERSION_MAJOR; + buf[10] = TLS_1_2_VERSION_MINOR; + buf[11] = size >> 8; + buf[12] = size & 0xFF; +} + static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); -- cgit v1.1 From 096d1dd0f03211fb42d6c2457f248827604b7f0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Nov 2017 16:19:46 +0100 Subject: netlink: remove unused NETLINK SKB flags These flags are unused, remove them to be less confusing. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/netlink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6ddb4a5..49b4257 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -17,9 +17,6 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) } enum netlink_skb_flags { - NETLINK_SKB_MMAPED = 0x1, /* Packet data is mmaped */ - NETLINK_SKB_TX = 0x2, /* Packet was sent by userspace */ - NETLINK_SKB_DELIVERED = 0x4, /* Packet was delivered */ NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; -- cgit v1.1 From 0eef304bc9f7d079a1165e8cd2f24b078e9e1f2a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 13 Nov 2017 03:37:06 +0300 Subject: uapi: fix linux/rxrpc.h userspace compilation errors Consistently use types provided by to fix the following linux/rxrpc.h userspace compilation errors: /usr/include/linux/rxrpc.h:24:2: error: unknown type name 'u16' u16 srx_service; /* service desired */ /usr/include/linux/rxrpc.h:25:2: error: unknown type name 'u16' u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ /usr/include/linux/rxrpc.h:26:2: error: unknown type name 'u16' u16 transport_len; /* length of transport address */ Use __kernel_sa_family_t instead of sa_family_t the same way as uapi/linux/in.h does, to fix the following linux/rxrpc.h userspace compilation errors: /usr/include/linux/rxrpc.h:23:2: error: unknown type name 'sa_family_t' sa_family_t srx_family; /* address family */ /usr/include/linux/rxrpc.h:28:3: error: unknown type name 'sa_family_t' sa_family_t family; /* transport address family */ Fixes: 727f8914477e ("rxrpc: Expose UAPI definitions to userspace") Cc: # v4.14 Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/rxrpc.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 9656aad..9d4afea 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -20,12 +20,12 @@ * RxRPC socket address */ struct sockaddr_rxrpc { - sa_family_t srx_family; /* address family */ - u16 srx_service; /* service desired */ - u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ - u16 transport_len; /* length of transport address */ + __kernel_sa_family_t srx_family; /* address family */ + __u16 srx_service; /* service desired */ + __u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ + __u16 transport_len; /* length of transport address */ union { - sa_family_t family; /* transport address family */ + __kernel_sa_family_t family; /* transport address family */ struct sockaddr_in sin; /* IPv4 transport address */ struct sockaddr_in6 sin6; /* IPv6 transport address */ } transport; -- cgit v1.1 From b9f3eb499d84f8d4adcb2f9212ec655700b28228 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 14 Nov 2017 06:30:11 +0300 Subject: uapi: fix linux/tls.h userspace compilation error Move inclusion of a private kernel header from uapi/linux/tls.h to its only user - net/tls.h, to fix the following linux/tls.h userspace compilation error: /usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory As to this point uapi/linux/tls.h was totaly unusuable for userspace, cleanup this header file further by moving other redundant includes to net/tls.h. Fixes: 3c4d7559159b ("tls: kernel TLS support") Cc: # v4.13+ Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++++ include/uapi/linux/tls.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 70becd0..936cfc5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -35,6 +35,10 @@ #define _TLS_OFFLOAD_H #include +#include +#include +#include +#include #include diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index d5e0682..293b2cd 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -35,10 +35,6 @@ #define _UAPI_LINUX_TLS_H #include -#include -#include -#include -#include /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ -- cgit v1.1 From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make default TCP default congestion control to a per namespace value. This changes default congestion control to a pointer to congestion ops (rather than implicit as first element of available lsit). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5e12975..44668c2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed71511..35cc7d0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); -int tcp_set_default_congestion_control(const char *name); -void tcp_get_default_congestion_control(char *name); +int tcp_set_default_congestion_control(struct net *net, const char *name); +void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); @@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else -- cgit v1.1 From 50895b9de1d3e0258e015e8e55128d835d9a9f19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 21:02:19 -0800 Subject: tcp: highest_sack fix syzbot easily found a regression added in our latest patches [1] No longer set tp->highest_sack to the head of the send queue since this is not logical and error prone. Only sack processing should maintain the pointer to an skb from rtx queue. We might in the future only remember the sequence instead of a pointer to skb, since rb-tree should allow a fast lookup. [1] BUG: KASAN: use-after-free in tcp_highest_sack_seq include/net/tcp.h:1706 [inline] BUG: KASAN: use-after-free in tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 Read of size 4 at addr ffff8801c154faa8 by task syz-executor4/12860 CPU: 0 PID: 12860 Comm: syz-executor4 Not tainted 4.14.0-next-20171113+ #41 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 tcp_highest_sack_seq include/net/tcp.h:1706 [inline] tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x452879 RSP: 002b:00007fc9761bfbe8 EFLAGS: 00000212 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452879 RDX: 0000000000000000 RSI: 0000000020917fc8 RDI: 0000000000000015 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006ee3a0 R13: 00000000ffffffff R14: 00007fc9761c06d4 R15: 0000000000000000 Allocated by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x144/0x760 mm/slab.c:3638 __alloc_skb+0xf1/0x780 net/core/skbuff.c:193 alloc_skb_fclone include/linux/skbuff.h:1023 [inline] sk_stream_alloc_skb+0x11d/0x900 net/ipv4/tcp.c:870 tcp_sendmsg_locked+0x1341/0x3b80 net/ipv4/tcp.c:1299 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1461 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 SYSC_sendto+0x358/0x5a0 net/socket.c:1749 SyS_sendto+0x40/0x50 net/socket.c:1717 entry_SYSCALL_64_fastpath+0x1f/0x96 Freed by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3492 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3750 kfree_skbmem+0xdd/0x1d0 net/core/skbuff.c:603 __kfree_skb+0x1d/0x20 net/core/skbuff.c:642 sk_wmem_free_skb include/net/sock.h:1419 [inline] tcp_rtx_queue_unlink_and_free include/net/tcp.h:1682 [inline] tcp_clean_rtx_queue net/ipv4/tcp_input.c:3111 [inline] tcp_ack+0x1b17/0x4fd0 net/ipv4/tcp_input.c:3593 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 The buggy address belongs to the object at ffff8801c154fa80 which belongs to the cache skbuff_fclone_cache of size 456 The buggy address is located 40 bytes inside of 456-byte region [ffff8801c154fa80, ffff8801c154fc48) The buggy address belongs to the page: page:ffffea00070553c0 count:1 mapcount:0 mapping:ffff8801c154f080 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffff8801c154f080 0000000000000000 0000000100000006 raw: ffffea00070a5a20 ffffea0006a18360 ffff8801d9ca0500 0000000000000000 page dumped because: kasan: bad access detected Fixes: 737ff314563c ("tcp: use sequence distance to detect reordering") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/tcp.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 35cc7d0..85ea578 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1630,9 +1630,6 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - - if (tcp_sk(sk)->highest_sack == skb_unlinked) - tcp_sk(sk)->highest_sack = NULL; } static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) @@ -1645,12 +1642,8 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_write_queue.next == skb) { + if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); - - if (tcp_sk(sk)->highest_sack == NULL) - tcp_sk(sk)->highest_sack = skb; - } } /* Insert new before skb on the write queue of sk. */ @@ -1708,9 +1701,7 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - struct sk_buff *next = skb_rb_next(skb); - - tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); + tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1720,9 +1711,7 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - struct sk_buff *skb = tcp_rtx_queue_head(sk); - - tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); + tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ -- cgit v1.1