Diffstat (limited to 'net')
-rw-r--r--  net/6lowpan/ndisc.c | 2
-rw-r--r--  net/batman-adv/bat_v_elp.c | 2
-rw-r--r--  net/batman-adv/routing.c | 28
-rw-r--r--  net/bluetooth/af_bluetooth.c | 15
-rw-r--r--  net/bluetooth/hci_core.c | 1
-rw-r--r--  net/bluetooth/hci_request.c | 49
-rw-r--r--  net/bluetooth/hci_request.h | 5
-rw-r--r--  net/bluetooth/hci_sock.c | 396
-rw-r--r--  net/bluetooth/leds.c | 27
-rw-r--r--  net/bluetooth/leds.h | 10
-rw-r--r--  net/bluetooth/mgmt.c | 353
-rw-r--r--  net/bluetooth/mgmt_util.c | 66
-rw-r--r--  net/bluetooth/smp.c | 5
-rw-r--r--  net/bridge/br.c | 6
-rw-r--r--  net/bridge/br_netfilter_hooks.c | 53
-rw-r--r--  net/bridge/br_netfilter_ipv6.c | 12
-rw-r--r--  net/bridge/netfilter/ebt_log.c | 2
-rw-r--r--  net/bridge/netfilter/ebt_redirect.c | 2
-rw-r--r--  net/bridge/netfilter/ebtables.c | 2
-rw-r--r--  net/bridge/netfilter/nf_tables_bridge.c | 92
-rw-r--r--  net/bridge/netfilter/nft_reject_bridge.c | 44
-rw-r--r--  net/core/dev.c | 75
-rw-r--r--  net/core/filter.c | 156
-rw-r--r--  net/core/pktgen.c | 21
-rw-r--r--  net/core/rtnetlink.c | 194
-rw-r--r--  net/core/skbuff.c | 103
-rw-r--r--  net/core/sock.c | 5
-rw-r--r--  net/core/stream.c | 1
-rw-r--r--  net/dsa/dsa.c | 8
-rw-r--r--  net/dsa/dsa2.c | 12
-rw-r--r--  net/dsa/slave.c | 35
-rw-r--r--  net/ipv4/Kconfig | 18
-rw-r--r--  net/ipv4/Makefile | 3
-rw-r--r--  net/ipv4/af_inet.c | 14
-rw-r--r--  net/ipv4/fib_frontend.c | 29
-rw-r--r--  net/ipv4/fib_rules.c | 12
-rw-r--r--  net/ipv4/fib_trie.c | 166
-rw-r--r--  net/ipv4/gre_offload.c | 6
-rw-r--r--  net/ipv4/ip_input.c | 5
-rw-r--r--  net/ipv4/ip_vti.c | 15
-rw-r--r--  net/ipv4/ipmr.c | 10
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_log_arp.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_log_ipv4.c | 10
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 13
-rw-r--r--  net/ipv4/netfilter/nf_tables_arp.c | 7
-rw-r--r--  net/ipv4/netfilter/nf_tables_ipv4.c | 5
-rw-r--r--  net/ipv4/netfilter/nft_chain_route_ipv4.c | 11
-rw-r--r--  net/ipv4/proc.c | 102
-rw-r--r--  net/ipv4/route.c | 13
-rw-r--r--  net/ipv4/tcp.c | 26
-rw-r--r--  net/ipv4/tcp_bbr.c | 896
-rw-r--r--  net/ipv4/tcp_cdg.c | 12
-rw-r--r--  net/ipv4/tcp_cong.c | 2
-rw-r--r--  net/ipv4/tcp_input.c | 160
-rw-r--r--  net/ipv4/tcp_ipv4.c | 1
-rw-r--r--  net/ipv4/tcp_minisocks.c | 5
-rw-r--r--  net/ipv4/tcp_offload.c | 13
-rw-r--r--  net/ipv4/tcp_output.c | 118
-rw-r--r--  net/ipv4/tcp_rate.c | 186
-rw-r--r--  net/ipv4/tcp_timer.c | 5
-rw-r--r--  net/ipv4/udp_offload.c | 6
-rw-r--r--  net/ipv6/addrconf.c | 94
-rw-r--r--  net/ipv6/ip6_gre.c | 3
-rw-r--r--  net/ipv6/ip6_offload.c | 5
-rw-r--r--  net/ipv6/ip6_vti.c | 19
-rw-r--r--  net/ipv6/ip6mr.c | 12
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 2
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2
-rw-r--r--  net/ipv6/netfilter/nf_log_ipv6.c | 18
-rw-r--r--  net/ipv6/netfilter/nf_tables_ipv6.c | 9
-rw-r--r--  net/ipv6/netfilter/nft_chain_route_ipv6.c | 14
-rw-r--r--  net/ipv6/proc.c | 30
-rw-r--r--  net/ipv6/route.c | 22
-rw-r--r--  net/ipv6/xfrm6_input.c | 16
-rw-r--r--  net/ipv6/xfrm6_tunnel.c | 2
-rw-r--r--  net/irda/af_irda.c | 5
-rw-r--r--  net/mac80211/agg-rx.c | 8
-rw-r--r--  net/mac80211/agg-tx.c | 3
-rw-r--r--  net/mac80211/ieee80211_i.h | 2
-rw-r--r--  net/mac80211/mesh_hwmp.c | 3
-rw-r--r--  net/mac80211/mesh_pathtbl.c | 2
-rw-r--r--  net/mac80211/rx.c | 7
-rw-r--r--  net/mac80211/sta_info.c | 56
-rw-r--r--  net/mac80211/sta_info.h | 19
-rw-r--r--  net/mac80211/status.c | 7
-rw-r--r--  net/mac80211/tx.c | 12
-rw-r--r--  net/mac802154/iface.c | 1
-rw-r--r--  net/mac802154/rx.c | 9
-rw-r--r--  net/mpls/internal.h | 10
-rw-r--r--  net/ncsi/internal.h | 22
-rw-r--r--  net/ncsi/ncsi-aen.c | 37
-rw-r--r--  net/ncsi/ncsi-cmd.c | 2
-rw-r--r--  net/ncsi/ncsi-manage.c | 198
-rw-r--r--  net/ncsi/ncsi-rsp.c | 4
-rw-r--r--  net/netfilter/Makefile | 3
-rw-r--r--  net/netfilter/core.c | 203
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 22
-rw-r--r--  net/netfilter/nf_conntrack_ftp.c | 15
-rw-r--r--  net/netfilter/nf_conntrack_h323_main.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_helper.c | 17
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 6
-rw-r--r--  net/netfilter/nf_conntrack_proto_gre.c | 14
-rw-r--r--  net/netfilter/nf_conntrack_seqadj.c | 20
-rw-r--r--  net/netfilter/nf_conntrack_sip.c | 10
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 13
-rw-r--r--  net/netfilter/nf_internals.h | 10
-rw-r--r--  net/netfilter/nf_log_common.c | 4
-rw-r--r--  net/netfilter/nf_nat_core.c | 5
-rw-r--r--  net/netfilter/nf_queue.c | 18
-rw-r--r--  net/netfilter/nf_tables_api.c | 25
-rw-r--r--  net/netfilter/nf_tables_core.c | 16
-rw-r--r--  net/netfilter/nf_tables_inet.c | 5
-rw-r--r--  net/netfilter/nf_tables_netdev.c | 101
-rw-r--r--  net/netfilter/nf_tables_trace.c | 22
-rw-r--r--  net/netfilter/nfnetlink_cthelper.c | 2
-rw-r--r--  net/netfilter/nfnetlink_log.c | 8
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 19
-rw-r--r--  net/netfilter/nft_bitwise.c | 8
-rw-r--r--  net/netfilter/nft_byteorder.c | 15
-rw-r--r--  net/netfilter/nft_cmp.c | 3
-rw-r--r--  net/netfilter/nft_ct.c | 21
-rw-r--r--  net/netfilter/nft_dynset.c | 20
-rw-r--r--  net/netfilter/nft_exthdr.c | 12
-rw-r--r--  net/netfilter/nft_hash.c | 17
-rw-r--r--  net/netfilter/nft_immediate.c | 4
-rw-r--r--  net/netfilter/nft_log.c | 9
-rw-r--r--  net/netfilter/nft_lookup.c | 2
-rw-r--r--  net/netfilter/nft_meta.c | 2
-rw-r--r--  net/netfilter/nft_numgen.c | 54
-rw-r--r--  net/netfilter/nft_payload.c | 4
-rw-r--r--  net/netfilter/nft_queue.c | 113
-rw-r--r--  net/netfilter/nft_quota.c | 8
-rw-r--r--  net/netfilter/nft_range.c | 138
-rw-r--r--  net/netfilter/xt_RATEEST.c | 6
-rw-r--r--  net/netfilter/xt_TCPMSS.c | 12
-rw-r--r--  net/netfilter/xt_TEE.c | 8
-rw-r--r--  net/netfilter/xt_connlimit.c | 8
-rw-r--r--  net/netfilter/xt_hashlimit.c | 339
-rw-r--r--  net/netfilter/xt_helper.c | 4
-rw-r--r--  net/netfilter/xt_recent.c | 7
-rw-r--r--  net/netfilter/xt_sctp.c | 2
-rw-r--r--  net/openvswitch/actions.c | 24
-rw-r--r--  net/openvswitch/datapath.c | 21
-rw-r--r--  net/openvswitch/flow.c | 13
-rw-r--r--  net/openvswitch/flow_netlink.c | 6
-rw-r--r--  net/openvswitch/flow_netlink.h | 3
-rw-r--r--  net/rxrpc/Kconfig | 7
-rw-r--r--  net/rxrpc/af_rxrpc.c | 26
-rw-r--r--  net/rxrpc/ar-internal.h | 326
-rw-r--r--  net/rxrpc/call_accept.c | 17
-rw-r--r--  net/rxrpc/call_event.c | 217
-rw-r--r--  net/rxrpc/call_object.c | 101
-rw-r--r--  net/rxrpc/conn_client.c | 149
-rw-r--r--  net/rxrpc/conn_event.c | 14
-rw-r--r--  net/rxrpc/conn_object.c | 73
-rw-r--r--  net/rxrpc/conn_service.c | 4
-rw-r--r--  net/rxrpc/input.c | 504
-rw-r--r--  net/rxrpc/local_event.c | 7
-rw-r--r--  net/rxrpc/local_object.c | 3
-rw-r--r--  net/rxrpc/misc.c | 186
-rw-r--r--  net/rxrpc/output.c | 200
-rw-r--r--  net/rxrpc/peer_event.c | 51
-rw-r--r--  net/rxrpc/peer_object.c | 1
-rw-r--r--  net/rxrpc/recvmsg.c | 139
-rw-r--r--  net/rxrpc/rxkad.c | 17
-rw-r--r--  net/rxrpc/security.c | 8
-rw-r--r--  net/rxrpc/sendmsg.c | 100
-rw-r--r--  net/rxrpc/skbuff.c | 48
-rw-r--r--  net/rxrpc/sysctl.c | 10
-rw-r--r--  net/sched/Kconfig | 5
-rw-r--r--  net/sched/Makefile | 1
-rw-r--r--  net/sched/act_api.c | 36
-rw-r--r--  net/sched/act_csum.c | 36
-rw-r--r--  net/sched/act_gact.c | 3
-rw-r--r--  net/sched/act_ife.c | 33
-rw-r--r--  net/sched/act_meta_skbtcindex.c | 79
-rw-r--r--  net/sched/act_mirred.c | 11
-rw-r--r--  net/sched/act_police.c | 12
-rw-r--r--  net/sched/act_vlan.c | 38
-rw-r--r--  net/sched/cls_api.c | 18
-rw-r--r--  net/sched/cls_bpf.c | 123
-rw-r--r--  net/sched/cls_flow.c | 21
-rw-r--r--  net/sched/cls_flower.c | 7
-rw-r--r--  net/sched/cls_fw.c | 10
-rw-r--r--  net/sched/cls_route.c | 12
-rw-r--r--  net/sched/cls_tcindex.c | 12
-rw-r--r--  net/sched/cls_u32.c | 30
-rw-r--r--  net/sched/sch_api.c | 41
-rw-r--r--  net/sched/sch_codel.c | 4
-rw-r--r--  net/sched/sch_fifo.c | 4
-rw-r--r--  net/sched/sch_fq.c | 43
-rw-r--r--  net/sched/sch_generic.c | 28
-rw-r--r--  net/sched/sch_htb.c | 24
-rw-r--r--  net/sched/sch_netem.c | 20
-rw-r--r--  net/sched/sch_pie.c | 4
-rw-r--r--  net/sched/sch_qfq.c | 3
-rw-r--r--  net/sched/sch_sfb.c | 3
-rw-r--r--  net/sctp/associola.c | 2
-rw-r--r--  net/sctp/chunk.c | 24
-rw-r--r--  net/sctp/input.c | 35
-rw-r--r--  net/sctp/inqueue.c | 2
-rw-r--r--  net/sctp/output.c | 12
-rw-r--r--  net/sctp/outqueue.c | 23
-rw-r--r--  net/sctp/proc.c | 10
-rw-r--r--  net/sctp/sctp_diag.c | 58
-rw-r--r--  net/sctp/sm_make_chunk.c | 43
-rw-r--r--  net/sctp/sm_statefuns.c | 6
-rw-r--r--  net/sctp/socket.c | 10
-rw-r--r--  net/sctp/transport.c | 4
-rw-r--r--  net/sctp/ulpevent.c | 4
-rw-r--r--  net/sctp/ulpqueue.c | 3
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c | 5
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 45
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 1
-rw-r--r--  net/sunrpc/xprtsock.c | 2
-rw-r--r--  net/switchdev/switchdev.c | 181
-rw-r--r--  net/sysctl_net.c | 2
-rw-r--r--  net/vmw_vsock/af_vsock.c | 6
-rw-r--r--  net/wireless/nl80211.c | 2
-rw-r--r--  net/xfrm/xfrm_proc.c | 10
-rw-r--r--  net/xfrm/xfrm_state.c | 5
-rw-r--r--  net/xfrm/xfrm_user.c | 9
226 files changed, 6750 insertions, 2382 deletions
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
index 86450b7..941df2f 100644
--- a/net/6lowpan/ndisc.c
+++ b/net/6lowpan/ndisc.c
@@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr))
neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
- } else {
- neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
}
write_unlock_bh(&n->lock);
}
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 7d17001..ee08540 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -335,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
goto out;
skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
- elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
+ elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
elp_packet = (struct batadv_elp_packet *)elp_buff;
memset(elp_packet, 0, BATADV_ELP_HLEN);
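
A note on the bat_v_elp.c one-liner above: skb_reserve() only creates headroom by advancing the data pointer, and skb_put() then appends the ELP payload at the tail. The replaced skb_push() would instead have prepended BATADV_ELP_HLEN bytes into the headroom that was just reserved for the ethernet header. A minimal user-space sketch of that pointer arithmetic, with a toy struct standing in for the real sk_buff (names are illustrative, not kernel API):

#include <assert.h>

/* Toy model of an sk_buff data area: payload lives in [data, tail). */
struct toy_skb {
	unsigned char buf[128];
	unsigned char *data;
	unsigned char *tail;
};

static void toy_reserve(struct toy_skb *s, int len)
{
	s->data += len;			/* like skb_reserve(): grow headroom */
	s->tail += len;
}

static unsigned char *toy_put(struct toy_skb *s, int len)
{
	unsigned char *old_tail = s->tail;

	s->tail += len;			/* like skb_put(): append at the tail */
	return old_tail;
}

static unsigned char *toy_push(struct toy_skb *s, int len)
{
	s->data -= len;			/* like skb_push(): prepend into headroom */
	return s->data;
}

int main(void)
{
	struct toy_skb s;

	s.data = s.buf;
	s.tail = s.buf;

	toy_reserve(&s, 16);			/* skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN) */
	assert(toy_put(&s, 8) == s.buf + 16);	/* ELP packet lands after the headroom */
	/* toy_push(&s, 8) would instead return s.buf + 8, consuming the
	 * headroom reserved for the ethernet header; that is the bug fixed. */
	return 0;
}
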
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 610f2c4..7e8dc64 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -461,6 +461,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
}
/**
+ * batadv_last_bonding_get - Get last_bonding_candidate of orig_node
+ * @orig_node: originator node whose last bonding candidate should be retrieved
+ *
+ * Return: last bonding candidate of router or NULL if not found
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+static struct batadv_orig_ifinfo *
+batadv_last_bonding_get(struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_ifinfo *last_bonding_candidate;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ last_bonding_candidate = orig_node->last_bonding_candidate;
+
+ if (last_bonding_candidate)
+ kref_get(&last_bonding_candidate->refcount);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ return last_bonding_candidate;
+}
+
+/**
* batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node
* @orig_node: originator node whose bonding candidates should be replaced
* @new_candidate: new bonding candidate or NULL
@@ -530,7 +553,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
* router - obviously there are no other candidates.
*/
rcu_read_lock();
- last_candidate = orig_node->last_bonding_candidate;
+ last_candidate = batadv_last_bonding_get(orig_node);
if (last_candidate)
last_cand_router = rcu_dereference(last_candidate->router);
@@ -622,6 +645,9 @@ next:
batadv_orig_ifinfo_put(next_candidate);
}
+ if (last_candidate)
+ batadv_orig_ifinfo_put(last_candidate);
+
return router;
}
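
The new batadv_last_bonding_get() above is the usual take-a-reference-under-the-lock pattern: reading the pointer and bumping its refcount happen inside the same neigh_list_lock section that writers hold, so the object cannot be freed between the two steps, and the caller later drops the reference with batadv_orig_ifinfo_put(), as the added hunk at the end of batadv_find_router() does. A user-space sketch of the pattern, with a plain mutex and integer standing in for the spinlock and kref (illustrative names; a real implementation would use atomic refcounts):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	int refcount;			/* kref stand-in; the kernel uses atomics */
};

struct holder {
	pthread_mutex_t lock;		/* plays the role of neigh_list_lock */
	struct obj *current_obj;	/* plays the role of last_bonding_candidate */
};

/* Return the current object with an extra reference held, or NULL. */
static struct obj *holder_get(struct holder *h)
{
	struct obj *o;

	pthread_mutex_lock(&h->lock);
	o = h->current_obj;
	if (o)
		o->refcount++;		/* kref_get() under the lock */
	pthread_mutex_unlock(&h->lock);
	return o;
}

/* Drop a reference; free on the last one, like kref_put(). */
static void obj_put(struct obj *o)
{
	if (o && --o->refcount == 0)
		free(o);
}

int main(void)
{
	struct holder h = { PTHREAD_MUTEX_INITIALIZER, NULL };

	h.current_obj = calloc(1, sizeof(*h.current_obj));
	h.current_obj->refcount = 1;	/* reference held by the holder itself */

	struct obj *o = holder_get(&h);	/* caller now owns a second reference */
	obj_put(o);			/* caller done */
	obj_put(h.current_obj);		/* holder drops the last reference */
	return 0;
}
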
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 0b5f729..1aff2da 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -26,11 +26,13 @@
#include <linux/module.h>
#include <linux/debugfs.h>
+#include <linux/stringify.h>
#include <asm/ioctls.h>
#include <net/bluetooth/bluetooth.h>
#include <linux/proc_fs.h>
+#include "leds.h"
#include "selftest.h"
/* Bluetooth sockets */
@@ -712,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = {
struct dentry *bt_debugfs;
EXPORT_SYMBOL_GPL(bt_debugfs);
+#define VERSION __stringify(BT_SUBSYS_VERSION) "." \
+ __stringify(BT_SUBSYS_REVISION)
+
static int __init bt_init(void)
{
int err;
sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
- BT_INFO("Core ver %s", BT_SUBSYS_VERSION);
+ BT_INFO("Core ver %s", VERSION);
err = bt_selftest();
if (err < 0)
@@ -726,6 +731,8 @@ static int __init bt_init(void)
bt_debugfs = debugfs_create_dir("bluetooth", NULL);
+ bt_leds_init();
+
err = bt_sysfs_init();
if (err < 0)
return err;
@@ -785,6 +792,8 @@ static void __exit bt_exit(void)
bt_sysfs_cleanup();
+ bt_leds_cleanup();
+
debugfs_remove_recursive(bt_debugfs);
}
@@ -792,7 +801,7 @@ subsys_initcall(bt_init);
module_exit(bt_exit);
MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
-MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION);
-MODULE_VERSION(BT_SUBSYS_VERSION);
+MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
+MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
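
The VERSION macro introduced above uses the standard two-level stringification trick from <linux/stringify.h>: the outer macro forces its argument to expand before the inner #x stringifies it, so numeric macros become string literals that the preprocessor concatenates. A stand-alone user-space illustration (the version numbers below are placeholders, not the real BT_SUBSYS values):

#include <stdio.h>

#define __stringify_1(x)	#x
#define __stringify(x)		__stringify_1(x)	/* expand, then stringify */

#define BT_SUBSYS_VERSION	2	/* placeholder value */
#define BT_SUBSYS_REVISION	22	/* placeholder value */

#define VERSION __stringify(BT_SUBSYS_VERSION) "." \
		__stringify(BT_SUBSYS_REVISION)

int main(void)
{
	/* Without the extra level, #x would yield "BT_SUBSYS_VERSION". */
	printf("Core ver %s\n", VERSION);	/* prints: Core ver 2.22 */
	return 0;
}
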
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ddf8432..3ac89e9 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
hci_dev_test_flag(hdev, HCI_MGMT))
__mgmt_power_off(hdev);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index b0e23df..c813568 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -971,14 +971,14 @@ void __hci_req_enable_advertising(struct hci_request *req)
hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
}
-static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
{
- u8 ad_len = 0;
size_t name_len;
+ int max_len;
+ max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
name_len = strlen(hdev->dev_name);
- if (name_len > 0) {
- size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
+ if (name_len > 0 && max_len > 0) {
if (name_len > max_len) {
name_len = max_len;
@@ -997,22 +997,42 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
return ad_len;
}
+static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+{
+ return append_local_name(hdev, ptr, 0);
+}
+
static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
u8 *ptr)
{
struct adv_info *adv_instance;
+ u32 instance_flags;
+ u8 scan_rsp_len = 0;
adv_instance = hci_find_adv_instance(hdev, instance);
if (!adv_instance)
return 0;
- /* TODO: Set the appropriate entries based on advertising instance flags
- * here once flags other than 0 are supported.
- */
+ instance_flags = adv_instance->flags;
+
+ if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) {
+ ptr[0] = 3;
+ ptr[1] = EIR_APPEARANCE;
+ put_unaligned_le16(hdev->appearance, ptr + 2);
+ scan_rsp_len += 4;
+ ptr += 4;
+ }
+
memcpy(ptr, adv_instance->scan_rsp_data,
adv_instance->scan_rsp_len);
- return adv_instance->scan_rsp_len;
+ scan_rsp_len += adv_instance->scan_rsp_len;
+ ptr += adv_instance->scan_rsp_len;
+
+ if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len);
+
+ return scan_rsp_len;
}
void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
@@ -1194,7 +1214,7 @@ static void adv_timeout_expire(struct work_struct *work)
hci_req_init(&req, hdev);
- hci_req_clear_adv_instance(hdev, &req, instance, false);
+ hci_req_clear_adv_instance(hdev, NULL, &req, instance, false);
if (list_empty(&hdev->adv_instances))
__hci_req_disable_advertising(&req);
@@ -1284,8 +1304,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev)
* setting.
* - force == false: Only instances that have a timeout will be removed.
*/
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force)
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
+ struct hci_request *req, u8 instance,
+ bool force)
{
struct adv_info *adv_instance, *n, *next_instance = NULL;
int err;
@@ -1311,7 +1332,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
rem_inst = adv_instance->instance;
err = hci_remove_adv_instance(hdev, rem_inst);
if (!err)
- mgmt_advertising_removed(NULL, hdev, rem_inst);
+ mgmt_advertising_removed(sk, hdev, rem_inst);
}
} else {
adv_instance = hci_find_adv_instance(hdev, instance);
@@ -1325,7 +1346,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
err = hci_remove_adv_instance(hdev, instance);
if (!err)
- mgmt_advertising_removed(NULL, hdev, instance);
+ mgmt_advertising_removed(sk, hdev, instance);
}
}
@@ -1716,7 +1737,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
* function. To be safe hard-code one of the
* values that's suitable for SCO.
*/
- rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES;
+ rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
sizeof(rej), &rej);
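
The append_local_name() refactor above makes the advertising-data budget explicit: every AD element costs one length byte and one type byte on top of its data, so a name appended after ad_len bytes of existing data gets at most HCI_MAX_AD_LENGTH - ad_len - 2 bytes, and an over-long name is truncated and tagged EIR_NAME_SHORT instead of EIR_NAME_COMPLETE. A compact user-space sketch of that arithmetic (constants inlined; a simplified analogue, not the kernel function):

#include <stdio.h>
#include <string.h>

#define HCI_MAX_AD_LENGTH	31	/* legacy advertising PDU payload */
#define EIR_NAME_SHORT		0x08
#define EIR_NAME_COMPLETE	0x09

/* Append a name TLV to 'ad' already holding ad_len bytes; return new length. */
static unsigned char append_name(unsigned char *ad, unsigned char ad_len,
				 const char *name)
{
	int max_len = HCI_MAX_AD_LENGTH - ad_len - 2;	/* minus len+type bytes */
	size_t name_len = strlen(name);
	unsigned char type = EIR_NAME_COMPLETE;

	if (name_len == 0 || max_len <= 0)
		return ad_len;
	if (name_len > (size_t)max_len) {
		name_len = max_len;		/* truncate the name ... */
		type = EIR_NAME_SHORT;		/* ... and say so in the type */
	}
	ad[ad_len++] = name_len + 1;		/* length covers type + data */
	ad[ad_len++] = type;
	memcpy(ad + ad_len, name, name_len);
	return ad_len + name_len;
}

int main(void)
{
	unsigned char ad[HCI_MAX_AD_LENGTH] = { 0 };

	printf("%u bytes used\n", append_name(ad, 4, "handset"));	/* 4+2+7 = 13 */
	return 0;
}
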
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index b2d044b..ac1e110 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -73,8 +73,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
bool force);
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force);
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
+ struct hci_request *req, u8 instance,
+ bool force);
void __hci_req_update_class(struct hci_request *req);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 96f04b7..48f9471 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/utsname.h>
+#include <linux/sched.h>
#include <asm/unaligned.h>
#include <net/bluetooth/bluetooth.h>
@@ -38,6 +39,8 @@
static LIST_HEAD(mgmt_chan_list);
static DEFINE_MUTEX(mgmt_chan_list_lock);
+static DEFINE_IDA(sock_cookie_ida);
+
static atomic_t monitor_promisc = ATOMIC_INIT(0);
/* ----- HCI socket interface ----- */
@@ -52,6 +55,8 @@ struct hci_pinfo {
__u32 cmsg_mask;
unsigned short channel;
unsigned long flags;
+ __u32 cookie;
+ char comm[TASK_COMM_LEN];
};
void hci_sock_set_flag(struct sock *sk, int nr)
@@ -74,6 +79,38 @@ unsigned short hci_sock_get_channel(struct sock *sk)
return hci_pi(sk)->channel;
}
+u32 hci_sock_get_cookie(struct sock *sk)
+{
+ return hci_pi(sk)->cookie;
+}
+
+static bool hci_sock_gen_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (!id) {
+ id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL);
+ if (id < 0)
+ id = 0xffffffff;
+
+ hci_pi(sk)->cookie = id;
+ get_task_comm(hci_pi(sk)->comm, current);
+ return true;
+ }
+
+ return false;
+}
+
+static void hci_sock_free_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (id) {
+ hci_pi(sk)->cookie = 0xffffffff;
+ ida_simple_remove(&sock_cookie_ida, id);
+ }
+}
+
static inline int hci_test_bit(int nr, const void *addr)
{
return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
@@ -305,6 +342,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(skb_copy);
}
+void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
+ void *data, u16 data_len, ktime_t tstamp,
+ int flag, struct sock *skip_sk)
+{
+ struct sock *sk;
+ __le16 index;
+
+ if (hdev)
+ index = cpu_to_le16(hdev->id);
+ else
+ index = cpu_to_le16(MGMT_INDEX_NONE);
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL)
+ continue;
+
+ /* Ignore socket without the flag set */
+ if (!hci_sock_test_flag(sk, flag))
+ continue;
+
+ /* Skip the original socket */
+ if (sk == skip_sk)
+ continue;
+
+ skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC);
+ if (!skb)
+ continue;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(event, skb_put(skb, 2));
+
+ if (data)
+ memcpy(skb_put(skb, data_len), data, data_len);
+
+ skb->tstamp = tstamp;
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
{
struct hci_mon_hdr *hdr;
@@ -384,6 +475,129 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
return skb;
}
+static struct sk_buff *create_monitor_ctrl_open(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ u16 format;
+ u8 ver[3];
+ u32 flags;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ format = 0x0000;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_USER:
+ format = 0x0001;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_CONTROL:
+ format = 0x0002;
+ mgmt_fill_version_info(ver);
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(format, skb_put(skb, 2));
+ memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver));
+ put_unaligned_le32(flags, skb_put(skb, 4));
+ *skb_put(skb, 1) = TASK_COMM_LEN;
+ memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_close(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(4, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index,
+ u16 opcode, u16 len,
+ const void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ memcpy(skb_put(skb, len), buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
static void __printf(2, 3)
send_monitor_note(struct sock *sk, const char *fmt, ...)
{
@@ -458,6 +672,26 @@ static void send_monitor_replay(struct sock *sk)
read_unlock(&hci_dev_list_lock);
}
+static void send_monitor_control_replay(struct sock *mon_sk)
+{
+ struct sock *sk;
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_ctrl_open(sk);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(mon_sk, skb))
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
/* Generate internal stack event */
static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
{
@@ -585,6 +819,7 @@ static int hci_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct hci_dev *hdev;
+ struct sk_buff *skb;
BT_DBG("sock %p sk %p", sock, sk);
@@ -593,8 +828,24 @@ static int hci_sock_release(struct socket *sock)
hdev = hci_pi(sk)->hdev;
- if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR)
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_MONITOR:
atomic_dec(&monitor_promisc);
+ break;
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ hci_sock_free_cookie(sk);
+ break;
+ }
bt_sock_unlink(&hci_sk_list, sk);
@@ -721,6 +972,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
goto done;
}
+ /* When an ioctl is called on an unbound raw socket, ensure
+ * that the monitor gets informed. Ensure that the resulting
+ * event is only sent once by checking if the cookie exists
+ * or not. The socket cookie will only ever be generated once
+ * for the lifetime of a given socket.
+ */
+ if (hci_sock_gen_cookie(sk)) {
+ struct sk_buff *skb;
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
release_sock(sk);
switch (cmd) {
@@ -784,6 +1056,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
struct sockaddr_hci haddr;
struct sock *sk = sock->sk;
struct hci_dev *hdev = NULL;
+ struct sk_buff *skb;
int len, err = 0;
BT_DBG("sock %p sk %p", sock, sk);
@@ -822,7 +1095,35 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
atomic_inc(&hdev->promisc);
}
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * an ioctl has already been issued against an unbound
+ * socket, which triggered an open notification. Send a
+ * close notification first to allow the state
+ * transition to bound.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
break;
case HCI_CHANNEL_USER:
@@ -884,9 +1185,38 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
}
}
- atomic_inc(&hdev->promisc);
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * this socket will transition from a raw socket into
+ * a user channel socket. For a clean transition, send
+ * the close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* The user channel is restricted to CAP_NET_ADMIN
+ * capabilities and with that implicitly trusted.
+ */
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ atomic_inc(&hdev->promisc);
break;
case HCI_CHANNEL_MONITOR:
@@ -900,6 +1230,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
goto done;
}
+ hci_pi(sk)->channel = haddr.hci_channel;
+
/* The monitor interface is restricted to CAP_NET_RAW
* capabilities and with that implicitly trusted.
*/
@@ -908,9 +1240,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
send_monitor_note(sk, "Linux version %s (%s)",
init_utsname()->release,
init_utsname()->machine);
- send_monitor_note(sk, "Bluetooth subsystem version %s",
- BT_SUBSYS_VERSION);
+ send_monitor_note(sk, "Bluetooth subsystem version %u.%u",
+ BT_SUBSYS_VERSION, BT_SUBSYS_REVISION);
send_monitor_replay(sk);
+ send_monitor_control_replay(sk);
atomic_inc(&monitor_promisc);
break;
@@ -925,6 +1258,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
err = -EPERM;
goto done;
}
+
+ hci_pi(sk)->channel = haddr.hci_channel;
break;
default:
@@ -946,6 +1281,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
if (capable(CAP_NET_ADMIN))
hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+ hci_pi(sk)->channel = haddr.hci_channel;
+
/* At the moment the index and unconfigured index events
* are enabled unconditionally. Setting them on each
* socket when binding keeps this functionality. They
@@ -956,16 +1293,40 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
* received by untrusted users. Example for such events
* are changes to settings, class of device, name etc.
*/
- if (haddr.hci_channel == HCI_CHANNEL_CONTROL) {
+ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) {
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been
+ * assigned, this socket will transition from
+ * a raw socket into a control socket. To
+ * allow for a clean transition, send the
+ * close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS);
hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
- hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
}
break;
}
-
- hci_pi(sk)->channel = haddr.hci_channel;
sk->sk_state = BT_BOUND;
done:
@@ -1133,6 +1494,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
goto done;
}
+ if (chan->channel == HCI_CHANNEL_CONTROL) {
+ struct sk_buff *skb;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_command(sk, index, opcode, len,
+ buf + sizeof(*hdr));
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
if (opcode >= chan->handler_count ||
chan->handlers[opcode].func == NULL) {
BT_DBG("Unknown op %u", opcode);
@@ -1440,6 +1814,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
+ if (level != SOL_HCI)
+ return -ENOPROTOOPT;
+
lock_sock(sk);
if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
@@ -1523,6 +1900,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
+ if (level != SOL_HCI)
+ return -ENOPROTOOPT;
+
if (get_user(len, optlen))
return -EFAULT;
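
The cookie machinery added to hci_sock.c above gives every raw, user-channel and control socket a unique identifier, allocated lazily from an IDA the first time the socket does something observable, so that monitor open/close/command events can be correlated per socket. hci_sock_gen_cookie() returns true only on the first call, which is what gates the one-time open notification. A user-space sketch of that lazy one-shot assignment (a bare counter stands in for ida_simple_get(); names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct sock_info {
	unsigned int cookie;	/* 0 means "not assigned yet" */
	char comm[16];		/* owner name, like the TASK_COMM_LEN field */
};

static unsigned int next_cookie = 1;	/* stands in for ida_simple_get() */

/* Assign a cookie once per socket; true only on the first call. */
static bool gen_cookie(struct sock_info *s, const char *owner)
{
	if (s->cookie)
		return false;		/* already assigned, no open event */
	s->cookie = next_cookie++;
	snprintf(s->comm, sizeof(s->comm), "%s", owner);
	return true;
}

int main(void)
{
	struct sock_info s = { 0 };

	if (gen_cookie(&s, "bluetoothd"))	/* first call: emit "open" */
		printf("open: cookie %u (%s)\n", s.cookie, s.comm);
	if (!gen_cookie(&s, "bluetoothd"))	/* later calls: no-op */
		printf("cookie stable: %u\n", s.cookie);
	return 0;
}

The kernel variant additionally releases the id in hci_sock_free_cookie() on socket close, so values can be recycled.
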
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index 8319c84..cb670b5 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -11,6 +11,8 @@
#include "leds.h"
+DEFINE_LED_TRIGGER(bt_power_led_trigger);
+
struct hci_basic_led_trigger {
struct led_trigger led_trigger;
struct hci_dev *hdev;
@@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
if (hdev->power_led)
led_trigger_event(hdev->power_led,
enabled ? LED_FULL : LED_OFF);
+
+ if (!enabled) {
+ struct hci_dev *d;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (test_bit(HCI_UP, &d->flags))
+ enabled = true;
+ }
+
+ read_unlock(&hci_dev_list_lock);
+ }
+
+ led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
}
static void power_activate(struct led_classdev *led_cdev)
@@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev)
/* initialize power_led */
hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
}
+
+void bt_leds_init(void)
+{
+ led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger);
+}
+
+void bt_leds_cleanup(void)
+{
+ led_trigger_unregister_simple(bt_power_led_trigger);
+}
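
The leds.c change above adds a single shared "bluetooth-power" trigger next to the per-device ones; its state is the OR over all controllers, which is why powering one device off forces a walk of hci_dev_list to check whether any other adapter still holds the LED on. A trivial user-space sketch of that aggregate-state rule (illustrative only):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for hci_dev_list: one powered flag per adapter. */
static bool dev_up[4];

/* The shared trigger is on while any adapter is up, mirroring the
 * list walk added to hci_leds_update_powered(). */
static bool any_device_up(void)
{
	for (int i = 0; i < 4; i++)
		if (dev_up[i])
			return true;
	return false;
}

int main(void)
{
	dev_up[0] = dev_up[2] = true;	/* hci0 and hci2 powered */
	dev_up[0] = false;		/* hci0 goes down... */
	printf("bluetooth-power LED: %s\n",
	       any_device_up() ? "on" : "off");	/* ...hci2 keeps it on */
	return 0;
}
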
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
index a9c4d6e..08725a2 100644
--- a/net/bluetooth/leds.h
+++ b/net/bluetooth/leds.h
@@ -7,10 +7,20 @@
*/
#if IS_ENABLED(CONFIG_BT_LEDS)
+
void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
void hci_leds_init(struct hci_dev *hdev);
+
+void bt_leds_init(void);
+void bt_leds_cleanup(void);
+
#else
+
static inline void hci_leds_update_powered(struct hci_dev *hdev,
bool enabled) {}
static inline void hci_leds_init(struct hci_dev *hdev) {}
+
+static inline void bt_leds_init(void) {}
+static inline void bt_leds_cleanup(void) {}
+
#endif
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7639290..19b8a5e 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 13
+#define MGMT_REVISION 14
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -104,6 +104,8 @@ static const u16 mgmt_commands[] = {
MGMT_OP_REMOVE_ADVERTISING,
MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_OP_START_LIMITED_DISCOVERY,
+ MGMT_OP_READ_EXT_INFO,
+ MGMT_OP_SET_APPEARANCE,
};
static const u16 mgmt_events[] = {
@@ -141,6 +143,7 @@ static const u16 mgmt_events[] = {
MGMT_EV_LOCAL_OOB_DATA_UPDATED,
MGMT_EV_ADVERTISING_ADDED,
MGMT_EV_ADVERTISING_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
};
static const u16 mgmt_untrusted_commands[] = {
@@ -149,6 +152,7 @@ static const u16 mgmt_untrusted_commands[] = {
MGMT_OP_READ_UNCONF_INDEX_LIST,
MGMT_OP_READ_CONFIG_INFO,
MGMT_OP_READ_EXT_INDEX_LIST,
+ MGMT_OP_READ_EXT_INFO,
};
static const u16 mgmt_untrusted_events[] = {
@@ -162,6 +166,7 @@ static const u16 mgmt_untrusted_events[] = {
MGMT_EV_NEW_CONFIG_OPTIONS,
MGMT_EV_EXT_INDEX_ADDED,
MGMT_EV_EXT_INDEX_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
};
#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
@@ -256,13 +261,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data,
flag, skip_sk);
}
-static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data,
- u16 len, struct sock *skip_sk)
-{
- return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
- HCI_MGMT_GENERIC_EVENTS, skip_sk);
-}
-
static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
struct sock *skip_sk)
{
@@ -278,6 +276,14 @@ static u8 le_addr_type(u8 mgmt_addr_type)
return ADDR_LE_DEV_RANDOM;
}
+void mgmt_fill_version_info(void *ver)
+{
+ struct mgmt_rp_read_version *rp = ver;
+
+ rp->version = MGMT_VERSION;
+ rp->revision = cpu_to_le16(MGMT_REVISION);
+}
+
static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
{
@@ -285,8 +291,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
BT_DBG("sock %p", sk);
- rp.version = MGMT_VERSION;
- rp.revision = cpu_to_le16(MGMT_REVISION);
+ mgmt_fill_version_info(&rp);
return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0,
&rp, sizeof(rp));
@@ -572,8 +577,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip)
{
__le32 options = get_missing_options(hdev);
- return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
- sizeof(options), skip);
+ return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
+ sizeof(options), HCI_MGMT_OPTION_EVENTS, skip);
}
static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
@@ -862,6 +867,107 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
sizeof(rp));
}
+static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
+ u8 data_len)
+{
+ eir[eir_len++] = sizeof(type) + data_len;
+ eir[eir_len++] = type;
+ memcpy(&eir[eir_len], data, data_len);
+ eir_len += data_len;
+
+ return eir_len;
+}
+
+static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
+{
+ eir[eir_len++] = sizeof(type) + sizeof(data);
+ eir[eir_len++] = type;
+ put_unaligned_le16(data, &eir[eir_len]);
+ eir_len += sizeof(data);
+
+ return eir_len;
+}
+
+static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
+{
+ u16 eir_len = 0;
+ size_t name_len;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
+ hdev->appearance);
+
+ name_len = strlen(hdev->dev_name);
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
+ hdev->dev_name, name_len);
+
+ name_len = strlen(hdev->short_name);
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
+ hdev->short_name, name_len);
+
+ return eir_len;
+}
+
+static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ char buf[512];
+ struct mgmt_rp_read_ext_info *rp = (void *)buf;
+ u16 eir_len;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ memset(&buf, 0, sizeof(buf));
+
+ hci_dev_lock(hdev);
+
+ bacpy(&rp->bdaddr, &hdev->bdaddr);
+
+ rp->version = hdev->hci_ver;
+ rp->manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ rp->supported_settings = cpu_to_le32(get_supported_settings(hdev));
+ rp->current_settings = cpu_to_le32(get_current_settings(hdev));
+
+ eir_len = append_eir_data_to_buf(hdev, rp->eir);
+ rp->eir_len = cpu_to_le16(eir_len);
+
+ hci_dev_unlock(hdev);
+
+ /* If this command is called at least once, then the events
+ * for class of device and local name changes are disabled
+ * and only the new extended controller information event
+ * is used.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp,
+ sizeof(*rp) + eir_len);
+}
+
+static int ext_info_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ char buf[512];
+ struct mgmt_ev_ext_info_changed *ev = (void *)buf;
+ u16 eir_len;
+
+ memset(buf, 0, sizeof(buf));
+
+ eir_len = append_eir_data_to_buf(hdev, ev->eir);
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev,
+ sizeof(*ev) + eir_len,
+ HCI_MGMT_EXT_INFO_EVENTS, skip);
+}
+
static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
{
__le32 settings = cpu_to_le32(get_current_settings(hdev));
@@ -922,7 +1028,7 @@ static int clean_up_hci_state(struct hci_dev *hdev)
hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
- hci_req_clear_adv_instance(hdev, NULL, 0x00, false);
+ hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false);
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
__hci_req_disable_advertising(&req);
@@ -1000,8 +1106,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip)
{
__le32 ev = cpu_to_le32(get_current_settings(hdev));
- return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
- sizeof(ev), skip);
+ return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
+ sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
}
int mgmt_new_settings(struct hci_dev *hdev)
@@ -1690,7 +1796,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
enabled = lmp_host_le_capable(hdev);
if (!val)
- hci_req_clear_adv_instance(hdev, NULL, 0x00, true);
+ hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true);
if (!hdev_is_powered(hdev) || val == enabled) {
bool changed = false;
@@ -2435,6 +2541,8 @@ static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
if (!cmd)
return -ENOMEM;
+ cmd->cmd_complete = addr_cmd_complete;
+
err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
sizeof(cp->addr.bdaddr), &cp->addr.bdaddr);
if (err < 0)
@@ -2513,8 +2621,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
BT_DBG("");
if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
- return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
- MGMT_STATUS_INVALID_PARAMS, NULL, 0);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_STATUS_INVALID_PARAMS);
hci_dev_lock(hdev);
@@ -2932,6 +3040,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
}
+static void adv_expire(struct hci_dev *hdev, u32 flags)
+{
+ struct adv_info *adv_instance;
+ struct hci_request req;
+ int err;
+
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+ if (!adv_instance)
+ return;
+
+ /* stop if current instance doesn't need to be changed */
+ if (!(adv_instance->flags & flags))
+ return;
+
+ cancel_adv_timeout(hdev);
+
+ adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
+ if (!adv_instance)
+ return;
+
+ hci_req_init(&req, hdev);
+ err = __hci_req_schedule_adv_instance(&req, adv_instance->instance,
+ true);
+ if (err)
+ return;
+
+ hci_req_run(&req, NULL);
+}
+
static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
struct mgmt_cp_set_local_name *cp;
@@ -2947,13 +3084,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
cp = cmd->param;
- if (status)
+ if (status) {
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
mgmt_status(status));
- else
+ } else {
mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
cp, sizeof(*cp));
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
+ }
+
mgmt_pending_remove(cmd);
unlock:
@@ -2993,8 +3134,9 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
if (err < 0)
goto failed;
- err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev,
- data, len, sk);
+ err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data,
+ len, HCI_MGMT_LOCAL_NAME_EVENTS, sk);
+ ext_info_changed(hdev, sk);
goto failed;
}
@@ -3017,7 +3159,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
/* The name is stored in the scan response data and so
* no need to update the advertising data here.
*/
- if (lmp_le_capable(hdev))
+ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING))
__hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance);
err = hci_req_run(&req, set_name_complete);
@@ -3029,6 +3171,40 @@ failed:
return err;
}
+static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_appearance *cp = data;
+ u16 appearance;
+ int err;
+
+ BT_DBG("");
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ appearance = le16_to_cpu(cp->appearance);
+
+ hci_dev_lock(hdev);
+
+ if (hdev->appearance != appearance) {
+ hdev->appearance = appearance;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE);
+
+ ext_info_changed(hdev, sk);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL,
+ 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -4869,7 +5045,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
int err;
memset(&rp, 0, sizeof(rp));
- memcpy(&rp.addr, &cmd->param, sizeof(rp.addr));
+ memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
if (status)
goto complete;
@@ -5501,17 +5677,6 @@ unlock:
return err;
}
-static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
- u8 data_len)
-{
- eir[eir_len++] = sizeof(type) + data_len;
- eir[eir_len++] = type;
- memcpy(&eir[eir_len], data, data_len);
- eir_len += data_len;
-
- return eir_len;
-}
-
static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -5815,6 +5980,8 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_DISCOV;
flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
+ flags |= MGMT_ADV_FLAG_APPEARANCE;
+ flags |= MGMT_ADV_FLAG_LOCAL_NAME;
if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
flags |= MGMT_ADV_FLAG_TX_POWER;
@@ -5871,28 +6038,59 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
return err;
}
-static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
- u8 len, bool is_adv_data)
+static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
{
u8 max_len = HCI_MAX_AD_LENGTH;
- int i, cur_len;
- bool flags_managed = false;
- bool tx_power_managed = false;
if (is_adv_data) {
if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS)) {
- flags_managed = true;
+ MGMT_ADV_FLAG_MANAGED_FLAGS))
max_len -= 3;
- }
- if (adv_flags & MGMT_ADV_FLAG_TX_POWER) {
- tx_power_managed = true;
+ if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
max_len -= 3;
- }
+ } else {
+ /* at least 1 byte of name should fit in */
+ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ max_len -= 3;
+
+ if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE))
+ max_len -= 4;
}
+ return max_len;
+}
+
+static bool flags_managed(u32 adv_flags)
+{
+ return adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS);
+}
+
+static bool tx_power_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_TX_POWER;
+}
+
+static bool name_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME;
+}
+
+static bool appearance_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_APPEARANCE;
+}
+
+static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data)
+{
+ int i, cur_len;
+ u8 max_len;
+
+ max_len = tlv_data_max_len(adv_flags, is_adv_data);
+
if (len > max_len)
return false;
@@ -5900,10 +6098,21 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) {
cur_len = data[i];
- if (flags_managed && data[i + 1] == EIR_FLAGS)
+ if (data[i + 1] == EIR_FLAGS &&
+ (!is_adv_data || flags_managed(adv_flags)))
+ return false;
+
+ if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags))
return false;
- if (tx_power_managed && data[i + 1] == EIR_TX_POWER)
+ if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_APPEARANCE &&
+ appearance_managed(adv_flags))
return false;
/* If the current field length would exceed the total data
@@ -6027,8 +6236,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) ||
- !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len,
+ if (!tlv_data_is_valid(flags, cp->data, cp->adv_data_len, true) ||
+ !tlv_data_is_valid(flags, cp->data + cp->adv_data_len,
cp->scan_rsp_len, false)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6175,7 +6384,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- hci_req_clear_adv_instance(hdev, &req, cp->instance, true);
+ hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true);
if (list_empty(&hdev->adv_instances))
__hci_req_disable_advertising(&req);
@@ -6211,23 +6420,6 @@ unlock:
return err;
}
-static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
-{
- u8 max_len = HCI_MAX_AD_LENGTH;
-
- if (is_adv_data) {
- if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
- MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS))
- max_len -= 3;
-
- if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
- max_len -= 3;
- }
-
- return max_len;
-}
-
static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
void *data, u16 data_len)
{
@@ -6356,6 +6548,9 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
{ get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE },
{ start_limited_discovery, MGMT_START_DISCOVERY_SIZE },
{ read_ext_controller_info, MGMT_READ_EXT_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_appearance, MGMT_SET_APPEARANCE_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
@@ -6494,9 +6689,12 @@ void __mgmt_power_off(struct hci_dev *hdev)
mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
- if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0)
- mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
- zero_cod, sizeof(zero_cod), NULL);
+ if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
+ zero_cod, sizeof(zero_cod),
+ HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
new_settings(hdev, match.sk);
@@ -7092,9 +7290,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
- if (!status)
- mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
- dev_class, 3, NULL);
+ if (!status) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class,
+ 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
if (match.sk)
sock_put(match.sk);
@@ -7123,8 +7323,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
return;
}
- mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
- cmd ? cmd->sk : NULL);
+ mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL);
+ ext_info_changed(hdev, cmd ? cmd->sk : NULL);
}
static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16])
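
Much of the mgmt.c diff above revolves around EIR-style TLV encoding: eir_append_data() and the new eir_append_le16() emit [length][type][data...] elements where the length byte covers the type byte plus the payload, and consumers of MGMT_EV_EXT_INFO_CHANGED walk the buffer by hopping length+1 bytes at a time. A small user-space sketch of the 16-bit variant and such a walk (a simplified analogue, not the kernel helpers):

#include <stdio.h>

#define EIR_NAME_COMPLETE	0x09
#define EIR_APPEARANCE		0x19

/* Same layout as the kernel helpers: [len][type][data], len covers type+data. */
static unsigned short append_le16(unsigned char *eir, unsigned short eir_len,
				  unsigned char type, unsigned short value)
{
	eir[eir_len++] = 1 + 2;		/* one type byte + two data bytes */
	eir[eir_len++] = type;
	eir[eir_len++] = value & 0xff;	/* little endian, as put_unaligned_le16() */
	eir[eir_len++] = value >> 8;
	return eir_len;
}

/* Walk the TLV chain the way a consumer of the event would. */
static void dump(const unsigned char *eir, unsigned short eir_len)
{
	for (unsigned short i = 0; i + 1 < eir_len; i += eir[i] + 1)
		printf("type 0x%02x, %d data bytes\n", eir[i + 1], eir[i] - 1);
}

int main(void)
{
	unsigned char eir[64];
	unsigned short len = append_le16(eir, 0, EIR_APPEARANCE, 0x03c1);

	dump(eir, len);		/* prints: type 0x19, 2 data bytes */
	return 0;
}
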
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index 8c30c7e..c933bd0 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -21,12 +21,41 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <asm/unaligned.h>
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_mon.h>
#include <net/bluetooth/mgmt.h>
#include "mgmt_util.h"
+static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie,
+ u16 opcode, u16 len, void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ memcpy(skb_put(skb, len), buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
void *data, u16 data_len, int flag, struct sock *skip_sk)
{
@@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
__net_timestamp(skb);
hci_send_to_channel(channel, skb, flag, skip_sk);
- kfree_skb(skb);
+ if (channel == HCI_CHANNEL_CONTROL)
+ hci_send_monitor_ctrl_event(hdev, event, data, data_len,
+ skb_get_ktime(skb), flag, skip_sk);
+
+ kfree_skb(skb);
return 0;
}
int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *mskb;
struct mgmt_hdr *hdr;
struct mgmt_ev_cmd_status *ev;
int err;
@@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
ev->status = status;
ev->opcode = cpu_to_le16(cmd);
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_STATUS, sizeof(*ev), ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
err = sock_queue_rcv_skb(sk, skb);
if (err < 0)
kfree_skb(skb);
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
return err;
}
int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
void *rp, size_t rp_len)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *mskb;
struct mgmt_hdr *hdr;
struct mgmt_ev_cmd_complete *ev;
int err;
@@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
if (rp)
memcpy(ev->data, rp, rp_len);
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_COMPLETE,
+ sizeof(*ev) + rp_len, ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
err = sock_queue_rcv_skb(sk, skb);
if (err < 0)
kfree_skb(skb);
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
return err;
}
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 4c1a16a..43faf2a 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3387,7 +3387,10 @@ int smp_register(struct hci_dev *hdev)
if (!lmp_sc_capable(hdev)) {
debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
hdev, &force_bredr_smp_fops);
- return 0;
+
+ /* Flag can already be set here (due to power toggle) */
+ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return 0;
}
if (WARN_ON(hdev->smp_bredr_data)) {
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 3addc05..889e564 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -227,9 +227,11 @@ static int __init br_init(void)
br_fdb_test_addr_hook = br_fdb_test_addr;
#endif
- pr_info("bridge: automatic filtering via arp/ip/ip6tables has been "
- "deprecated. Update your scripts to load br_netfilter if you "
+#if IS_MODULE(CONFIG_BRIDGE_NETFILTER)
+ pr_info("bridge: filtering via arp/ip/ip6tables is no longer available "
+ "by default. Update your scripts to load br_netfilter if you "
"need this.\n");
+#endif
return 0;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 77e7f69..2fe9345 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -30,6 +30,7 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
+#include <linux/rculist.h>
#include <linux/inetdevice.h>
#include <net/ip.h>
@@ -395,11 +396,10 @@ bridged_dnat:
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE,
- NF_BR_PRE_ROUTING,
- net, sk, skb, skb->dev, NULL,
- br_nf_pre_routing_finish_bridge,
- 1);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev,
+ NULL,
+ br_nf_pre_routing_finish);
return 0;
}
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -417,10 +417,8 @@ bridged_dnat:
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
- skb->dev, NULL,
- br_handle_frame_finish, 1);
-
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
+ br_handle_frame_finish);
return 0;
}
@@ -992,6 +990,43 @@ static struct notifier_block brnf_notifier __read_mostly = {
.notifier_call = brnf_device_event,
};
+/* recursively invokes nf_hook_slow (again), skipping already-called
+ * hooks (< NF_BR_PRI_BRNF).
+ *
+ * Called with rcu read lock held.
+ */
+int br_nf_hook_thresh(unsigned int hook, struct net *net,
+ struct sock *sk, struct sk_buff *skb,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct net *, struct sock *,
+ struct sk_buff *))
+{
+ struct nf_hook_entry *elem;
+ struct nf_hook_state state;
+ int ret;
+
+ elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
+
+ while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF))
+ elem = rcu_dereference(elem->next);
+
+ if (!elem)
+ return okfn(net, sk, skb);
+
+ /* We may already have this, but read-locks nest anyway */
+ rcu_read_lock();
+ nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1,
+ NFPROTO_BRIDGE, indev, outdev, sk, net, okfn);
+
+ ret = nf_hook_slow(skb, &state);
+ rcu_read_unlock();
+ if (ret == 1)
+ ret = okfn(net, sk, skb);
+
+ return ret;
+}
+
#ifdef CONFIG_SYSCTL
static
int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
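
The br_nf_hook_thresh() helper above reimplements what NF_HOOK_THRESH used to provide: when br_netfilter re-injects a packet into NF_BR_PRE_ROUTING, it walks the hook list itself and skips every entry at or below its own priority (NF_BR_PRI_BRNF), so its own hook does not run a second time; if nothing is left, okfn() is called directly. A minimal user-space sketch of that skip-then-run traversal (verdict 1 meaning "continue", loosely as with nf_hook_slow(); illustrative types, no RCU):

#include <stdio.h>

struct hook_entry {
	int priority;
	int (*fn)(const char *pkt);
	struct hook_entry *next;
};

/* Run the hooks above 'thresh'; call okfn() if the packet survives. */
static int run_hooks_thresh(struct hook_entry *head, int thresh,
			    const char *pkt, int (*okfn)(const char *))
{
	while (head && head->priority <= thresh)
		head = head->next;		/* skip already-run hooks */

	for (; head; head = head->next) {
		int verdict = head->fn(pkt);
		if (verdict != 1)		/* anything but "accept": stop */
			return verdict;
	}
	return okfn(pkt);			/* no hooks left to consult */
}

static int log_hook(const char *pkt) { printf("hook sees %s\n", pkt); return 1; }
static int deliver(const char *pkt) { printf("deliver %s\n", pkt); return 0; }

int main(void)
{
	struct hook_entry later = { 10, log_hook, NULL };	/* runs */
	struct hook_entry early = { -300, log_hook, &later };	/* skipped */

	return run_hooks_thresh(&early, 0, "frame", deliver);
}
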
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 5e59a84..5989661 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
- net, sk, skb, skb->dev, NULL,
- br_nf_pre_routing_finish_bridge,
- 1);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING,
+ net, sk, skb, skb->dev, NULL,
+ br_nf_pre_routing_finish_bridge);
return 0;
}
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
@@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
skb->dev = nf_bridge->physindev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
- NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb,
- skb->dev, NULL,
- br_handle_frame_finish, 1);
+ br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
+ skb->dev, NULL, br_handle_frame_finish);
return 0;
}
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 152300d..9a11086 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
if (loginfo->type == NF_LOG_TYPE_LOG)
bitmask = loginfo->u.log.logflags;
else
- bitmask = NF_LOG_MASK;
+ bitmask = NF_LOG_DEFAULT_MASK;
if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
htons(ETH_P_IP)) {
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 20396499..2e7c4f9 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -24,7 +24,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
return EBT_DROP;
if (par->hooknum != NF_BR_BROUTING)
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
ether_addr_copy(eth_hdr(skb)->h_dest,
br_port_get_rcu(par->in)->br->dev->dev_addr);
else
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0833c25..f5c11bb 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -146,7 +146,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
return 1;
if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out)))
return 1;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
if (in && (p = br_port_get_rcu(in)) != NULL &&
NF_INVF(e, EBT_ILOGICALIN,
ebt_dev_check(e->logical_in, p->br->dev)))
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index a78c4e2..97afdc0 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -13,79 +13,11 @@
#include <linux/module.h>
#include <linux/netfilter_bridge.h>
#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_bridge.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/netfilter/nf_tables_ipv4.h>
#include <net/netfilter/nf_tables_ipv6.h>
-int nft_bridge_iphdr_validate(struct sk_buff *skb)
-{
- struct iphdr *iph;
- u32 len;
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return 0;
-
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
- return 0;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len)
- return 0;
- else if (len < (iph->ihl*4))
- return 0;
-
- if (!pskb_may_pull(skb, iph->ihl*4))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate);
-
-int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
-{
- struct ipv6hdr *hdr;
- u32 pkt_len;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- return 0;
-
- hdr = ipv6_hdr(skb);
- if (hdr->version != 6)
- return 0;
-
- pkt_len = ntohs(hdr->payload_len);
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate);
-
-static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- if (nft_bridge_iphdr_validate(skb))
- nft_set_pktinfo_ipv4(pkt, skb, state);
- else
- nft_set_pktinfo(pkt, skb, state);
-}
-
-static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_IPV6)
- if (nft_bridge_ip6hdr_validate(skb) &&
- nft_set_pktinfo_ipv6(pkt, skb, state) == 0)
- return;
-#endif
- nft_set_pktinfo(pkt, skb, state);
-}
-
static unsigned int
nft_do_chain_bridge(void *priv,
struct sk_buff *skb,
@@ -95,13 +27,13 @@ nft_do_chain_bridge(void *priv,
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
- nft_bridge_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
break;
case htons(ETH_P_IPV6):
- nft_bridge_set_pktinfo_ipv6(&pkt, skb, state);
+ nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
break;
default:
- nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb, state);
break;
}
@@ -207,12 +139,20 @@ static int __init nf_tables_bridge_init(void)
int ret;
nf_register_afinfo(&nf_br_afinfo);
- nft_register_chain_type(&filter_bridge);
+ ret = nft_register_chain_type(&filter_bridge);
+ if (ret < 0)
+ goto err1;
+
ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
- if (ret < 0) {
- nft_unregister_chain_type(&filter_bridge);
- nf_unregister_afinfo(&nf_br_afinfo);
- }
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ nft_unregister_chain_type(&filter_bridge);
+err1:
+ nf_unregister_afinfo(&nf_br_afinfo);
return ret;
}
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 0b77ffb..4b3df6b 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -14,7 +14,6 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_reject.h>
-#include <net/netfilter/nf_tables_bridge.h>
#include <net/netfilter/ipv4/nf_reject.h>
#include <net/netfilter/ipv6/nf_reject.h>
#include <linux/ip.h>
@@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
skb_pull(nskb, ETH_HLEN);
}
+static int nft_bridge_iphdr_validate(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return 0;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ return 0;
+ else if (len < (iph->ihl*4))
+ return 0;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ return 0;
+
+ return 1;
+}
+
/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
* or the bridge port (NF_BRIDGE PREROUTING).
*/
@@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
br_forward(br_port_get_rcu(dev), nskb, false, true);
}
+static int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+
static void nft_reject_br_send_v6_tcp_reset(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 9dbece2..f1fe26f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4055,12 +4055,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
{
#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
+ int ingress_retval;
+
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- return nf_hook_ingress(skb);
+ rcu_read_lock();
+ ingress_retval = nf_hook_ingress(skb);
+ rcu_read_unlock();
+ return ingress_retval;
}
#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
@@ -5584,6 +5589,7 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
+ u16 ref_nr,
struct list_head *dev_list,
void *private, bool master)
{
@@ -5593,7 +5599,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
- adj->ref_nr++;
+ adj->ref_nr += ref_nr;
return 0;
}
@@ -5603,7 +5609,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->dev = adj_dev;
adj->master = master;
- adj->ref_nr = 1;
+ adj->ref_nr = ref_nr;
adj->private = private;
dev_hold(adj_dev);
@@ -5642,6 +5648,7 @@ free_adj:
static void __netdev_adjacent_dev_remove(struct net_device *dev,
struct net_device *adj_dev,
+ u16 ref_nr,
struct list_head *dev_list)
{
struct netdev_adjacent *adj;
@@ -5654,10 +5661,10 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
BUG();
}
- if (adj->ref_nr > 1) {
- pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
- adj->ref_nr-1);
- adj->ref_nr--;
+ if (adj->ref_nr > ref_nr) {
+ pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
+ ref_nr, adj->ref_nr-ref_nr);
+ adj->ref_nr -= ref_nr;
return;
}
@@ -5676,21 +5683,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
+ u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
{
int ret;
- ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
- master);
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
+ private, master);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
- false);
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
+ private, false);
if (ret) {
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
return ret;
}
@@ -5698,9 +5706,10 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
}
static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev)
+ struct net_device *upper_dev,
+ u16 ref_nr)
{
- return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower,
NULL, false);
@@ -5708,17 +5717,19 @@ static int __netdev_adjacent_dev_link(struct net_device *dev,
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
+ u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list)
{
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
- __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
+ struct net_device *upper_dev,
+ u16 ref_nr)
{
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower);
}
@@ -5727,17 +5738,17 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
- int ret = __netdev_adjacent_dev_link(dev, upper_dev);
+ int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower,
private, master);
if (ret) {
- __netdev_adjacent_dev_unlink(dev, upper_dev);
+ __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
return ret;
}
@@ -5747,8 +5758,8 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
- __netdev_adjacent_dev_unlink(dev, upper_dev);
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
}
@@ -5801,7 +5812,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
pr_debug("Interlinking %s with %s, non-neighbour\n",
i->dev->name, j->dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
if (ret)
goto rollback_mesh;
}
@@ -5811,7 +5822,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
pr_debug("linking %s's upper device %s with %s\n",
upper_dev->name, i->dev->name, dev->name);
- ret = __netdev_adjacent_dev_link(dev, i->dev);
+ ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
if (ret)
goto rollback_upper_mesh;
}
@@ -5820,7 +5831,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
pr_debug("linking %s's lower device %s with %s\n", dev->name,
i->dev->name, upper_dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
if (ret)
goto rollback_lower_mesh;
}
@@ -5838,7 +5849,7 @@ rollback_lower_mesh:
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
if (i == to_i)
break;
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
}
i = NULL;
@@ -5848,7 +5859,7 @@ rollback_upper_mesh:
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
if (i == to_i)
break;
- __netdev_adjacent_dev_unlink(dev, i->dev);
+ __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
}
i = j = NULL;
@@ -5860,7 +5871,7 @@ rollback_mesh:
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
if (i == to_i && j == to_j)
break;
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
}
if (i == to_i)
break;
@@ -5940,16 +5951,16 @@ void netdev_upper_dev_unlink(struct net_device *dev,
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list)
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
/* remove also the devices itself from lower/upper device
* list
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list)
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(dev, i->dev);
+ __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
diff --git a/net/core/filter.c b/net/core/filter.c
index 298b146..00351cd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
return err;
}
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+ return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
if (skb_at_tc_ingress(skb))
@@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_STACK_SIZE,
};
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* The idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo the
+ * test, since, implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writable, this
+ * can be done once at the start of the program for the direct
+ * access case. By this we overcome the limitation of only the
+ * current headroom being accessible.
+ */
+ return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+ .func = bpf_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
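To make the new helper's contract concrete: a clsact program pulls first and
then re-derives its packet pointers, because the pull invalidates every
earlier bounds check. A sketch assuming the usual libbpf-style headers
(SEC(), TC_ACT_*); the section name and the 64-byte count are illustrative:

	SEC("classifier")
	int parse(struct __sk_buff *skb)
	{
		void *data, *data_end;

		/* linearize at least the first 64 bytes into the head */
		if (bpf_skb_pull_data(skb, 64))
			return TC_ACT_SHOT;

		/* the pull invalidated prior checks: re-derive and re-test */
		data     = (void *)(long)skb->data;
		data_end = (void *)(long)skb->data_end;
		if (data + 64 > data_end)
			return TC_ACT_SHOT;

		/* ... direct reads/writes within the first 64 bytes ... */
		return TC_ACT_OK;
	}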
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
u64, from, u64, to, u64, flags)
{
@@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
static const struct bpf_func_proto bpf_csum_diff_proto = {
.func = bpf_csum_diff,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.arg5_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+ /* The interface is to be used in combination with bpf_csum_diff()
+ * for direct packet writes. csum rotation for alignment as well
+ * as emulating csum_sub() can be done from the eBPF program.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ return (skb->csum = csum_add(skb->csum, csum));
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+ .func = bpf_csum_update,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
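The pairing with bpf_csum_diff() described in the comment might look like
this from a program; the wrapper below is an illustrative sketch, not part of
this series:

	static __always_inline int rewrite_word(struct __sk_buff *skb, u32 off,
						__be32 from, __be32 to)
	{
		__be32 f = from, t = to;
		/* fold the 4-byte old/new delta into a wsum */
		__wsum diff = bpf_csum_diff(&f, 4, &t, 4, 0);

		if (bpf_skb_store_bytes(skb, off, &t, 4, 0) < 0)
			return -1;

		/* keeps skb->csum valid when ip_summed == CHECKSUM_COMPLETE;
		 * returns -ENOTSUPP otherwise, which callers may ignore
		 */
		bpf_csum_update(skb, diff);
		return 0;
	}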
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
@@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
struct net_device *dev;
+ struct sk_buff *clone;
+ int ret;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return -EINVAL;
@@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
if (unlikely(!dev))
return -EINVAL;
- skb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb))
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!clone))
return -ENOMEM;
- bpf_push_mac_rcsum(skb);
+ /* For direct write, we need to keep the invariant that the skbs
+ * we're dealing with are uncloned. Should uncloning fail here,
+ * we need to free the just-generated clone so that a later
+ * attempt can unclone again.
+ */
+ ret = bpf_try_make_head_writable(skb);
+ if (unlikely(ret)) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+
+ bpf_push_mac_rcsum(clone);
return flags & BPF_F_INGRESS ?
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+ __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
}
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1716,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+ /* After any direct packet write, this can be used once to
+ * trigger a lazy recalc on the next skb_get_hash() invocation.
+ */
+ skb_clear_hash(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+ .func = bpf_set_hash_invalid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
u16, vlan_tci)
{
@@ -2063,19 +2140,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
bool bpf_helper_changes_skb_data(void *func)
{
- if (func == bpf_skb_vlan_push)
- return true;
- if (func == bpf_skb_vlan_pop)
- return true;
- if (func == bpf_skb_store_bytes)
- return true;
- if (func == bpf_skb_change_proto)
- return true;
- if (func == bpf_skb_change_tail)
- return true;
- if (func == bpf_l3_csum_replace)
- return true;
- if (func == bpf_l4_csum_replace)
+ if (func == bpf_skb_vlan_push ||
+ func == bpf_skb_vlan_pop ||
+ func == bpf_skb_store_bytes ||
+ func == bpf_skb_change_proto ||
+ func == bpf_skb_change_tail ||
+ func == bpf_skb_pull_data ||
+ func == bpf_l3_csum_replace ||
+ func == bpf_l4_csum_replace)
return true;
return false;
@@ -2352,7 +2424,7 @@ BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
struct cgroup *cgrp;
struct sock *sk;
- sk = skb->sk;
+ sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return -ENOENT;
if (unlikely(idx >= array->map.max_entries))
@@ -2440,8 +2512,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
return &bpf_csum_diff_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -2474,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -2491,6 +2569,8 @@ xdp_func_proto(enum bpf_func_id func_id)
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_xdp_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
default:
return sk_filter_func_proto(func_id);
}
@@ -2533,6 +2613,45 @@ static bool sk_filter_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!direct_write)
+ return 0;
+
+ /* if (!skb->cloned)
+ * goto start;
+ *
+ * (Fast path; otherwise we conservatively assume we might be
+ * a clone and do the rest in the helper.)
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+ /* ret = bpf_skb_pull_data(skb, 0); */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_pull_data);
+ /* if (!ret)
+ * goto restore;
+ * return TC_ACT_SHOT;
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+ *insn++ = BPF_EXIT_INSN();
+
+ /* restore: */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ /* start: */
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
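Rendered as C, the bytecode emitted above is roughly equivalent to the
following paraphrase (for readability only, not kernel source):

	if (skb->cloned) {
		/* unclone/make the head writable before any direct write */
		if (bpf_skb_pull_data(skb, 0) != 0)
			return TC_ACT_SHOT;
	}
	/* fall through to the program's original first instruction */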
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type,
enum bpf_reg_type *reg_type)
@@ -2810,6 +2929,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
.get_func_proto = tc_cls_act_func_proto,
.is_valid_access = tc_cls_act_is_valid_access,
.convert_ctx_access = tc_cls_act_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
};
static const struct bpf_verifier_ops xdp_ops = {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index bbd118b..5219a9e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2286,7 +2286,7 @@ out:
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
{
- pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev);
+ pkt_dev->pkt_overhead = 0;
pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
@@ -2777,13 +2777,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
- struct pktgen_dev *pkt_dev,
- unsigned int extralen)
+ struct pktgen_dev *pkt_dev)
{
+ unsigned int extralen = LL_RESERVED_SPACE(dev);
struct sk_buff *skb = NULL;
- unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen +
- pkt_dev->pkt_overhead;
+ unsigned int size;
+ size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead;
if (pkt_dev->flags & F_NODE) {
int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
@@ -2796,8 +2796,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
+ /* the caller pre-fetches from skb->data and reserves for the mac hdr */
if (likely(skb))
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, extralen - 16);
return skb;
}
@@ -2830,16 +2831,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- datalen = (odev->hard_header_len + 16) & ~0xf;
-
- skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
prefetchw(skb->data);
- skb_reserve(skb, datalen);
+ skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
eth = (__u8 *) skb_push(skb, 14);
@@ -2959,7 +2958,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- skb = pktgen_alloc_skb(odev, pkt_dev, 16);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 937e459..b06d2f4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += nla_total_size(num_vfs * sizeof(struct nlattr));
size += num_vfs *
(nla_total_size(sizeof(struct ifla_vf_mac)) +
- nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct nlattr)) +
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct ifla_vf_vlan_info)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
@@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct nlattr *vfinfo)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct nlattr *vf, *vfstats, *vfvlanlist;
struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_vlan_info vf_vlan_info;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_stats vf_stats;
struct ifla_vf_trust vf_trust;
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
- struct nlattr *vf, *vfstats;
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
@@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
ivi.linkstate = 0;
+ /* VLAN Protocol by default is 802.1Q */
+ ivi.vlan_proto = htons(ETH_P_8021Q);
if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
return 0;
vf_mac.vf =
vf_vlan.vf =
+ vf_vlan_info.vf =
vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
@@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
+ vf_vlan_info.vlan = ivi.vlan;
+ vf_vlan_info.qos = ivi.qos;
+ vf_vlan_info.vlan_proto = ivi.vlan_proto;
vf_tx_rate.rate = ivi.max_tx_rate;
vf_rate.min_tx_rate = ivi.min_tx_rate;
vf_rate.max_tx_rate = ivi.max_tx_rate;
@@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_rss_query_en.setting = ivi.rss_query_en;
vf_trust.setting = ivi.trusted;
vf = nla_nest_start(skb, IFLA_VF_INFO);
- if (!vf) {
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vf)
+ goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
@@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
&vf_rss_query_en) ||
nla_put(skb, IFLA_VF_TRUST,
sizeof(vf_trust), &vf_trust))
- return -EMSGSIZE;
+ goto nla_put_vf_failure;
+ vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+ if (!vfvlanlist)
+ goto nla_put_vf_failure;
+ if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+ &vf_vlan_info)) {
+ nla_nest_cancel(skb, vfvlanlist);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfvlanlist);
memset(&vf_stats, 0, sizeof(vf_stats));
if (dev->netdev_ops->ndo_get_vf_stats)
dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
&vf_stats);
vfstats = nla_nest_start(skb, IFLA_VF_STATS);
- if (!vfstats) {
- nla_nest_cancel(skb, vf);
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vfstats)
+ goto nla_put_vf_failure;
if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
@@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD))
- return -EMSGSIZE;
+ vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
return 0;
+
+nla_put_vf_failure:
+ nla_nest_cancel(skb, vf);
+nla_put_vfinfo_failure:
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
}
static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
@@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
[IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
@@ -1704,7 +1727,37 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
- ivv->qos);
+ ivv->qos,
+ htons(ETH_P_8021Q));
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN_LIST]) {
+ struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+ struct nlattr *attr;
+ int rem, len = 0;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_vlan)
+ return err;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ return -EINVAL;
+ }
+ if (len >= MAX_VLAN_LIST_LEN)
+ return -EOPNOTSUPP;
+ ivvl[len] = nla_data(attr);
+
+ len++;
+ }
+ if (len == 0)
+ return -EINVAL;
+
+ err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+ ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
return err;
}
@@ -3577,6 +3630,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+ switch (attr_id) {
+ case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+ return sizeof(struct rtnl_link_stats64);
+ }
+
+ return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+ int *prividx)
+{
+ struct nlattr *attr = NULL;
+ int attr_id, size;
+ void *attr_data;
+ int err;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return -ENODATA;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (attr_id < *prividx)
+ continue;
+
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (!size)
+ continue;
+
+ if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ continue;
+
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+ attr_data);
+ if (err)
+ goto get_offload_stats_failure;
+ }
+
+ if (!attr)
+ return -ENODATA;
+
+ *prividx = 0;
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+get_offload_stats_failure:
+ *prividx = attr_id;
+ return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+ int nla_size = 0;
+ int attr_id;
+ int size;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return 0;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ continue;
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (nla_size != 0)
+ nla_size += nla_total_size(0);
+
+ return nla_size;
+}
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, unsigned int filter_mask,
@@ -3586,6 +3724,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
struct nlmsghdr *nlh;
struct nlattr *attr;
int s_prividx = *prividx;
+ int err;
ASSERT_RTNL();
@@ -3614,8 +3753,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS);
@@ -3639,8 +3776,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (master)
ops = master->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3655,6 +3790,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+ *idxattr)) {
+ *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+ attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = rtnl_get_offload_stats(skb, dev, prividx);
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, attr);
+ else
+ nla_nest_end(skb, attr);
+
+ if (err && err != -ENODATA)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -3708,6 +3861,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+ size += rtnl_get_offload_stats_size(dev);
+
return size;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e329d4..cbd19d2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3097,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
- /* GSO partial only requires that we trim off any excess that
- * doesn't fit into an MSS sized block, so take care of that
- * now.
- */
- if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
+ if (sg && csum && (mss != GSO_BY_FRAGS)) {
+ if (!(features & NETIF_F_GSO_PARTIAL)) {
+ struct sk_buff *iter;
+
+ if (!list_skb ||
+ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
+ goto normal;
+
+ /* Split the buffer at the frag_list pointer.
+ * This is based on the assumption that all
+ * buffers in the chain, excluding the last,
+ * contain the same amount of data.
+ */
+ skb_walk_frags(head_skb, iter) {
+ if (skb_headlen(iter))
+ goto normal;
+
+ len -= iter->len;
+ }
+ }
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
@@ -3109,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
partial_segs = 0;
}
+normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3300,21 +3321,29 @@ perform_csum_check:
*/
segs->prev = tail;
- /* Update GSO info on first skb in partial sequence. */
if (partial_segs) {
+ struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
+ unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
- type |= SKB_GSO_PARTIAL;
+ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
- skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
- skb_shinfo(segs)->gso_segs = partial_segs;
- skb_shinfo(segs)->gso_type = type;
- SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ for (iter = segs; iter; iter = iter->next) {
+ skb_shinfo(iter)->gso_size = gso_size;
+ skb_shinfo(iter)->gso_segs = partial_segs;
+ skb_shinfo(iter)->gso_type = type;
+ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
+ }
+
+ if (tail->len - doffset <= gso_size)
+ skb_shinfo(tail)->gso_size = 0;
+ else if (tail != segs)
+ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
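The branchless expression above turns one feature bit into a different flag
bit without a conditional; written out, it is equivalent to:

	if (features & NETIF_F_GSO_PARTIAL)
		type |= SKB_GSO_PARTIAL;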
/* Following permits correct backpressure, for protocols
@@ -4493,17 +4522,24 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
}
EXPORT_SYMBOL(skb_ensure_writable);
-/* remove VLAN header from packet and update csum accordingly. */
-static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+/* remove VLAN header from packet and update csum accordingly.
+ * expects a non-skb_vlan_tag_present skb with a vlan tag payload
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
- unsigned int offset = skb->data - skb_mac_header(skb);
+ int offset = skb->data - skb_mac_header(skb);
int err;
- __skb_push(skb, offset);
+ if (WARN_ONCE(offset,
+ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
if (unlikely(err))
- goto pull;
+ return err;
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
@@ -4520,12 +4556,14 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
skb_set_network_header(skb, ETH_HLEN);
skb_reset_mac_len(skb);
-pull:
- __skb_pull(skb, offset);
return err;
}
+EXPORT_SYMBOL(__skb_vlan_pop);
+/* Pop a vlan tag either from hwaccel or from payload.
+ * Expects skb->data at mac header.
+ */
int skb_vlan_pop(struct sk_buff *skb)
{
u16 vlan_tci;
@@ -4535,9 +4573,7 @@ int skb_vlan_pop(struct sk_buff *skb)
if (likely(skb_vlan_tag_present(skb))) {
skb->vlan_tci = 0;
} else {
- if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
err = __skb_vlan_pop(skb, &vlan_tci);
@@ -4545,9 +4581,7 @@ int skb_vlan_pop(struct sk_buff *skb)
return err;
}
/* move next vlan tag to hw accel tag */
- if (likely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (likely(!eth_type_vlan(skb->protocol)))
return 0;
vlan_proto = skb->protocol;
@@ -4560,29 +4594,30 @@ int skb_vlan_pop(struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_vlan_pop);
+/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
+ * Expects skb->data at mac header.
+ */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
if (skb_vlan_tag_present(skb)) {
- unsigned int offset = skb->data - skb_mac_header(skb);
+ int offset = skb->data - skb_mac_header(skb);
int err;
- /* __vlan_insert_tag expect skb->data pointing to mac header.
- * So change skb->data before calling it and change back to
- * original position later
- */
- __skb_push(skb, offset);
+ if (WARN_ONCE(offset,
+ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
err = __vlan_insert_tag(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
- if (err) {
- __skb_pull(skb, offset);
+ if (err)
return err;
- }
skb->protocol = skb->vlan_proto;
skb->mac_len += VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
- __skb_pull(skb, offset);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
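Both helpers now assume skb->data already sits at the mac header rather than
compensating internally, and they WARN and return -EINVAL when that
expectation is violated. A caller-side sketch of the new contract
(illustrative):

	/* ensure skb->data points at the mac header before pop/push */
	if (skb->data != skb_mac_header(skb))
		__skb_push(skb, skb->data - skb_mac_header(skb));

	err = skb_vlan_pop(skb);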
diff --git a/net/core/sock.c b/net/core/sock.c
index 51a7304..038e660 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1340,7 +1340,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
- cgroup_sk_alloc(&sk->sk_cgrp_data);
}
return sk;
@@ -1400,6 +1399,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
+ cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
}
@@ -1544,6 +1544,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(&newsk->sk_cookie, 0);
+
+ cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/stream.c b/net/core/stream.c
index 159516a..1086c8b 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk)
rcu_read_unlock();
}
}
-EXPORT_SYMBOL(sk_stream_write_space);
/**
* sk_stream_wait_connect - Wait for a socket to get into the connected state
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 66e31ac..a6902c1 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -378,9 +378,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
if (ret < 0)
goto out;
- ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
- if (ret < 0)
- goto out;
+ if (ops->set_addr) {
+ ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
+ if (ret < 0)
+ goto out;
+ }
if (!ds->slave_mii_bus && ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(parent);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 8278385..f8a7d9a 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -304,13 +304,11 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
if (err < 0)
return err;
- err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
- if (err < 0)
- return err;
-
- err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
- if (err < 0)
- return err;
+ if (ds->ops->set_addr) {
+ err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
+ if (err < 0)
+ return err;
+ }
if (!ds->slave_mii_bus && ds->ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9ecbe78..6b1282c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -69,6 +69,30 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
return !!p->bridge_dev;
}
+static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
+{
+ struct dsa_port *dp = &ds->ports[port];
+
+ if (ds->ops->port_stp_state_set)
+ ds->ops->port_stp_state_set(ds, port, state);
+
+ if (ds->ops->port_fast_age) {
+ /* Fast-age FDB entries or flush the appropriate forwarding
+ * database for the given port if we are moving it from the
+ * Learning or Forwarding state to the Disabled, Blocking or
+ * Listening state.
+ */
+
+ if ((dp->stp_state == BR_STATE_LEARNING ||
+ dp->stp_state == BR_STATE_FORWARDING) &&
+ (state == BR_STATE_DISABLED ||
+ state == BR_STATE_BLOCKING ||
+ state == BR_STATE_LISTENING))
+ ds->ops->port_fast_age(ds, port);
+ }
+
+ dp->stp_state = state;
+}
+
static int dsa_slave_open(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
@@ -104,8 +128,7 @@ static int dsa_slave_open(struct net_device *dev)
goto clear_promisc;
}
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, p->port, stp_state);
+ dsa_port_set_stp_state(ds, p->port, stp_state);
if (p->phy)
phy_start(p->phy);
@@ -147,8 +170,7 @@ static int dsa_slave_close(struct net_device *dev)
if (ds->ops->port_disable)
ds->ops->port_disable(ds, p->port, p->phy);
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, p->port, BR_STATE_DISABLED);
+ dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
return 0;
}
@@ -354,7 +376,7 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
if (switchdev_trans_ph_prepare(trans))
return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
- ds->ops->port_stp_state_set(ds, p->port, attr->u.stp_state);
+ dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
return 0;
}
@@ -556,8 +578,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
* so allow it to be in BR_STATE_FORWARDING to be kept functional
*/
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING);
+ dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
}
static int dsa_slave_port_attr_get(struct net_device *dev,
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b..300b068 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ ---help---
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
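Besides the build-time default selected here, a congestion control module can
also be chosen per socket; a user-space sketch assuming TCP_CONG_BBR is built
and fq pacing is attached to the egress device:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <string.h>
	#include <sys/socket.h>

	/* opt a single TCP socket into BBR */
	int use_bbr(int fd)
	{
		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
				  "bbr", strlen("bbr"));
	}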
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6..bc6a6c8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_recovery.o \
+ tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e94b47b..1effc98 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- bool udpfrag = false, fixedid = false, encap;
+ bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
@@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (IS_ERR_OR_NULL(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
}
- tot_len = skb_shinfo(skb)->gso_size +
- SKB_GSO_CB(skb)->data_offset +
- skb->head - (unsigned char *)iph;
+
+ if (gso_partial)
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
+ else
+ tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e56a4c..c3b8047 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -182,26 +182,13 @@ static void fib_flush(struct net *net)
struct fib_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
- flushed += fib_table_flush(tb);
+ flushed += fib_table_flush(net, tb);
}
if (flushed)
rt_cache_flush(net);
}
-void fib_flush_external(struct net *net)
-{
- struct fib_table *tb;
- struct hlist_head *head;
- unsigned int h;
-
- for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist)
- fib_table_flush_external(tb);
- }
-}
-
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -590,13 +577,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCDELRT) {
tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = fib_table_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg);
else
err = -ESRCH;
} else {
tb = fib_new_table(net, cfg.fc_table);
if (tb)
- err = fib_table_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg);
else
err = -ENOBUFS;
}
@@ -719,7 +706,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
}
- err = fib_table_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg);
errout:
return err;
}
@@ -741,7 +728,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
}
- err = fib_table_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg);
errout:
return err;
}
@@ -828,9 +815,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
cfg.fc_scope = RT_SCOPE_HOST;
if (cmd == RTM_NEWROUTE)
- fib_table_insert(tb, &cfg);
+ fib_table_insert(net, tb, &cfg);
else
- fib_table_delete(tb, &cfg);
+ fib_table_delete(net, tb, &cfg);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -1254,7 +1241,7 @@ static void ip_fib_net_exit(struct net *net)
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
hlist_del(&tb->tb_hlist);
- fib_table_flush(tb);
+ fib_table_flush(net, tb);
fib_free_table(tb);
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 770bebe..2e50062 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -164,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
+static int call_fib_rule_notifiers(struct net *net,
+ enum fib_event_type event_type)
+{
+ struct fib_notifier_info info;
+
+ return call_fib_notifiers(net, event_type, &info);
+}
+
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
FRA_GENERIC_POLICY,
[FRA_FLOW] = { .type = NLA_U32 },
@@ -220,7 +228,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
- fib_flush_external(rule->fr_net);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
err = 0;
errout:
@@ -242,7 +250,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
- fib_flush_external(rule->fr_net);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
errout:
return err;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 241f27b..31cef36 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -73,6 +73,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
+#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,10 +81,47 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/switchdev.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
+static BLOCKING_NOTIFIER_HEAD(fib_chain);
+
+int register_fib_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&fib_chain, nb);
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return blocking_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
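On the consumer side, an offloading driver would subscribe to this chain in
place of the removed switchdev_fib_ipv4_* calls; a sketch in which the
callback body and names are illustrative:

	static int my_fib_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
	{
		struct fib_entry_notifier_info *fen_info = ptr;

		switch (event) {
		case FIB_EVENT_ENTRY_ADD:
			/* program fen_info->dst/dst_len into hardware */
			break;
		case FIB_EVENT_ENTRY_DEL:
			/* remove the route from hardware again */
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block my_fib_nb = {
		.notifier_call = my_fib_event,
	};

	/* at init: register_fib_notifier(&my_fib_nb); */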
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -1076,7 +1114,8 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
}
/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
@@ -1175,17 +1214,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- err = switchdev_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- kmem_cache_free(fn_alias_kmem, new_fa);
- goto out;
- }
-
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1193,6 +1221,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
@@ -1228,30 +1261,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- /* (Optionally) offload fib entry to switch hardware. */
- err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
- cfg->fc_nlflags, tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- goto out_free_new_fa;
- }
-
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_sw_fib_del;
+ goto out_free_new_fa;
if (!plen)
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
+ cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
-out_sw_fib_del:
- switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1490,7 +1515,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
}
/* Caller must hold RTNL. */
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
@@ -1543,9 +1569,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
-
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
+ fa_to_delete->fa_info, tos, cfg->fc_type,
+ tb->tb_id, 0);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1734,82 +1760,8 @@ out:
return NULL;
}
-/* Caller must hold RTNL */
-void fib_table_flush_external(struct fib_table *tb)
-{
- struct trie *t = (struct trie *)tb->tb_data;
- struct key_vector *pn = t->kv;
- unsigned long cindex = 1;
- struct hlist_node *tmp;
- struct fib_alias *fa;
-
- /* walk trie in reverse order */
- for (;;) {
- unsigned char slen = 0;
- struct key_vector *n;
-
- if (!(cindex--)) {
- t_key pkey = pn->key;
-
- /* cannot resize the trie vector */
- if (IS_TRIE(pn))
- break;
-
- /* resize completed node */
- pn = resize(t, pn);
- cindex = get_index(pkey, pn);
-
- continue;
- }
-
- /* grab the next available node */
- n = get_child(pn, cindex);
- if (!n)
- continue;
-
- if (IS_TNODE(n)) {
- /* record pn and cindex for leaf walking */
- pn = n;
- cindex = 1ul << n->bits;
-
- continue;
- }
-
- hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- /* if alias was cloned to local then we just
- * need to remove the local copy from main
- */
- if (tb->tb_id != fa->tb_id) {
- hlist_del_rcu(&fa->fa_list);
- alias_free_mem_rcu(fa);
- continue;
- }
-
- /* record local slen */
- slen = fa->fa_slen;
-
- if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
- continue;
-
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
- }
-
- /* update leaf slen */
- n->slen = slen;
-
- if (hlist_empty(&n->leaf)) {
- put_child_root(pn, n->key, NULL);
- node_free(n);
- }
- }
-}
-
/* Caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
@@ -1858,9 +1810,11 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
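
The deleted switchdev_fib_ipv4_*() calls are superseded by the FIB notifier
chain invoked above. A minimal consumer could look like the sketch below; it
assumes the register_fib_notifier() API and struct fib_entry_notifier_info
that accompany the call_fib_entry_notifiers() calls in this series.

	#include <linux/notifier.h>
	#include <net/ip_fib.h>

	static int demo_fib_event(struct notifier_block *nb,
				  unsigned long event, void *ptr)
	{
		struct fib_entry_notifier_info *fen_info = ptr;

		switch (event) {
		case FIB_EVENT_ENTRY_ADD:
			/* offload fen_info->dst / fen_info->dst_len here */
			break;
		case FIB_EVENT_ENTRY_DEL:
			/* withdraw the offloaded entry here */
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block demo_fib_nb = {
		.notifier_call = demo_fib_event,
	};
	/* register_fib_notifier(&demo_fib_nb) at probe time,
	 * unregister_fib_notifier(&demo_fib_nb) at teardown.
	 */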
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09..96e0efe 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo;
+ bool need_csum, ufo, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
gre_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4b351af..d6feabb 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ struct net_device *dev = skb->dev;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
+ iph->tos, dev);
if (unlikely(err)) {
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
@@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cc701fa..5d7944f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
u32 orig_mark = skb->mark;
int ret;
@@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(tunnel->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2625332..5f006e1 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
+ unsigned long lastuse;
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
@@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
- nla_put_u64_64bit(skb, RTA_EXPIRES,
- jiffies_to_clock_t(c->mfc_un.res.lastuse),
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
return -EMSGSIZE;
@@ -2120,7 +2123,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ipmr_get_route(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr,
- struct rtmsg *rtm, int nowait)
+ struct rtmsg *rtm, int nowait, u32 portid)
{
struct mfc_cache *cache;
struct mr_table *mrt;
@@ -2165,6 +2168,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
return -ENOMEM;
}
+ NETLINK_CB(skb2).portid = portid;
skb_push(skb2, sizeof(struct iphdr));
skb_reset_network_header(skb2);
iph = ip_hdr(skb2);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f993545..7c00ce9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = {
.u = {
.log = {
.level = 4,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 870aebd..713c09a 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -110,7 +110,7 @@ static unsigned int ipv4_helper(void *priv,
if (!help)
return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b5904b..d075b3c 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -149,7 +149,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return -NF_ACCEPT;
}
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 8945c26..b24795e 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 20f2255..8566489 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
else
- logflags = NF_LOG_MASK;
+ logflags = NF_LOG_DEFAULT_MASK;
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
@@ -76,7 +76,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
if (ntohs(ih->frag_off) & IP_OFFSET)
nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
- if ((logflags & XT_LOG_IPOPT) &&
+ if ((logflags & NF_LOG_IPOPT) &&
ih->ihl * 4 > sizeof(struct iphdr)) {
const unsigned char *op;
unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
@@ -250,7 +250,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
}
/* Max length: 15 "UID=4294967295 " */
- if ((logflags & XT_LOG_UID) && !iphoff)
+ if ((logflags & NF_LOG_UID) && !iphoff)
nf_log_dump_sk_uid_gid(m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -282,7 +282,7 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
- if (!(logflags & XT_LOG_MACDECODE))
+ if (!(logflags & NF_LOG_MACDECODE))
goto fallback;
switch (dev->type) {
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 9414923..edf0500 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
- const struct gre_hdr *greh;
- struct gre_hdr_pptp *pgreh;
+ const struct gre_base_hdr *greh;
+ struct pptp_gre_header *pgreh;
/* pgreh includes two optional 32bit fields which are not required
* to be there. That's where the magic '8' comes from */
@@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb,
return false;
greh = (void *)skb->data + hdroff;
- pgreh = (struct gre_hdr_pptp *)greh;
+ pgreh = (struct pptp_gre_header *)greh;
/* we only have destination manip of a packet, since 'source key'
* is not present in the packet itself */
if (maniptype != NF_NAT_MANIP_DST)
return true;
- switch (greh->version) {
- case GRE_VERSION_1701:
+
+ switch (greh->flags & GRE_VERSION) {
+ case GRE_VERSION_0:
/* We do not currently NAT any GREv0 packets.
* Try to behave like "nf_nat_proto_unknown" */
break;
- case GRE_VERSION_PPTP:
+ case GRE_VERSION_1:
pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
pgreh->call_id = tuple->dst.u.gre.key;
break;
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index cd84d42..805c8dd 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv,
{
struct nft_pktinfo pkt;
- nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb, state);
return nft_do_chain(&pkt, priv);
}
@@ -80,7 +80,10 @@ static int __init nf_tables_arp_init(void)
{
int ret;
- nft_register_chain_type(&filter_arp);
+ ret = nft_register_chain_type(&filter_arp);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_arp_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_arp);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index e44ba3b..2840a29 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -103,7 +103,10 @@ static int __init nf_tables_ipv4_init(void)
{
int ret;
- nft_register_chain_type(&filter_ipv4);
+ ret = nft_register_chain_type(&filter_ipv4);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_ipv4);
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 2375b0a..30493be 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
__be32 saddr, daddr;
u_int8_t tos;
const struct iphdr *iph;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv,
tos = iph->tos;
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
- ret = NF_DROP;
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
return ret;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 1ed015e..7143ca1 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -46,6 +46,8 @@
#include <net/sock.h>
#include <net/raw.h>
+#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
+
/*
* Report socket allocation statistics [mea@utu.fi]
*/
@@ -356,22 +358,22 @@ static void icmp_put(struct seq_file *seq)
atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
seq_puts(seq, " OutMsgs OutErrors");
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu %lu",
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + icmpmibmap[i].index));
seq_printf(seq, " %lu %lu",
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
- for (i = 0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}
@@ -379,14 +381,16 @@ static void icmp_put(struct seq_file *seq)
/*
* Called from the PROCfs module. This outputs /proc/net/snmp.
*/
-static int snmp_seq_show(struct seq_file *seq, void *v)
+static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
- int i;
struct net *net = seq->private;
+ u64 buff64[IPSTATS_MIB_MAX];
+ int i;
- seq_puts(seq, "Ip: Forwarding DefaultTTL");
+ memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
- for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_puts(seq, "Ip: Forwarding DefaultTTL");
+ for (i = 0; snmp4_ipstats_list[i].name; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
@@ -394,57 +398,77 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
net->ipv4.sysctl_ip_default_ttl);
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
- for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
- seq_printf(seq, " %llu",
- snmp_fold_field64(net->mib.ip_statistics,
- snmp4_ipstats_list[i].entry,
- offsetof(struct ipstats_mib, syncp)));
+ snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; snmp4_ipstats_list[i].name; i++)
+ seq_printf(seq, " %llu", buff64[i]);
- icmp_put(seq); /* RFC 2011 compatibility */
- icmpmsg_put(seq);
+ return 0;
+}
+
+static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
+{
+ unsigned long buff[TCPUDP_MIB_MAX];
+ struct net *net = seq->private;
+ int i;
+
+ memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+ for (i = 0; snmp4_tcp_list[i].name; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+ snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
+ net->mib.tcp_statistics);
+ for (i = 0; snmp4_tcp_list[i].name; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
- seq_printf(seq, " %ld",
- snmp_fold_field(net->mib.tcp_statistics,
- snmp4_tcp_list[i].entry));
+ seq_printf(seq, " %ld", buff[i]);
else
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.tcp_statistics,
- snmp4_tcp_list[i].entry));
+ seq_printf(seq, " %lu", buff[i]);
}
+ memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+
+ snmp_get_cpu_field_batch(buff, snmp4_udp_list,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ for (i = 0; snmp4_udp_list[i].name; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.udp_statistics,
- snmp4_udp_list[i].entry));
+ for (i = 0; snmp4_udp_list[i].name; i++)
+ seq_printf(seq, " %lu", buff[i]);
+
+ memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ snmp_get_cpu_field_batch(buff, snmp4_udp_list,
+ net->mib.udplite_statistics);
+ for (i = 0; snmp4_udp_list[i].name; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.udplite_statistics,
- snmp4_udp_list[i].entry));
+ for (i = 0; snmp4_udp_list[i].name; i++)
+ seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
return 0;
}
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+ snmp_seq_show_ipstats(seq, v);
+
+ icmp_put(seq); /* RFC 2011 compatibility */
+ icmpmsg_put(seq);
+
+ snmp_seq_show_tcp_udp(seq, v);
+
+ return 0;
+}
+
static int snmp_seq_open(struct inode *inode, struct file *file)
{
return single_open_net(inode, file, snmp_seq_show);
@@ -469,21 +493,21 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
struct net *net = seq->private;
seq_puts(seq, "TcpExt:");
- for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ for (i = 0; snmp4_net_list[i].name; i++)
seq_printf(seq, " %s", snmp4_net_list[i].name);
seq_puts(seq, "\nTcpExt:");
- for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ for (i = 0; snmp4_net_list[i].name; i++)
seq_printf(seq, " %lu",
snmp_fold_field(net->mib.net_statistics,
snmp4_net_list[i].entry));
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ for (i = 0; snmp4_ipextstats_list[i].name; i++)
seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ for (i = 0; snmp4_ipextstats_list[i].name; i++)
seq_printf(seq, " %llu",
snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipextstats_list[i].entry,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b52496f..f2be689 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
- u32 delta = 0;
+ u32 new, delta = 0;
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, p_id) - segs;
+ /* Do not use atomic_add_return() as it makes UBSAN unhappy */
+ do {
+ old = (u32)atomic_read(p_id);
+ new = old + delta + segs;
+ } while (atomic_cmpxchg(p_id, old, new) != old);
+
+ return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
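
The loop above replaces atomic_add_return() because the latter performs a
signed addition that can overflow (undefined behavior, which UBSAN flags),
while the open-coded compare-and-swap works on u32 values with well-defined
wraparound. A user-space sketch of the same pattern, using C11 atomics:

	#include <stdatomic.h>
	#include <stdint.h>

	/* Hedged sketch: reserve `count' ids; the addition happens on
	 * uint32_t, where wraparound is well defined.
	 */
	static uint32_t reserve_ids(_Atomic uint32_t *p_id, uint32_t count)
	{
		uint32_t old = atomic_load(p_id);
		uint32_t new;

		do {
			new = old + count;
		} while (!atomic_compare_exchange_weak(p_id, &old, new));

		return new - count;	/* first id of the reserved block */
	}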
@@ -2494,7 +2500,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
int err = ipmr_get_route(net, skb,
fl4->saddr, fl4->daddr,
- r, nowait);
+ r, nowait, portid);
+
if (err <= 0) {
if (!nowait) {
if (err == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7dae800..f253e50 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- tp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk)
*/
tp->snd_cwnd = TCP_INIT_CWND;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
@@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
+
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
@@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
@@ -2704,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp;
+ u32 now = tcp_time_stamp, intv;
unsigned int start;
int notsent_bytes;
u64 rate64;
@@ -2794,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
+
+ info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
+ rate = READ_ONCE(tp->rate_delivered);
+ intv = READ_ONCE(tp->rate_interval_us);
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ put_unaligned(rate64, &info->tcpi_delivery_rate);
+ }
}
EXPORT_SYMBOL_GPL(tcp_get_info);
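
A worked example of the conversion above: with rate_delivered = 100 packets,
mss_cache = 1448 bytes and rate_interval_us = 10000, rate64 becomes
100 * 1448 * 1000000 / 10000 = 14,480,000 bytes/sec (~116 Mbit/s). A minimal
sketch of the same arithmetic:

	#include <linux/kernel.h>
	#include <linux/time.h>

	/* Hedged sketch mirroring the math above: delivery rate in
	 * bytes/sec from packets delivered, MSS, and interval in usec.
	 */
	static inline u64 delivery_rate_bytes_per_sec(u32 delivered_pkts,
						      u32 mss, u32 interval_us)
	{
		u64 rate = (u64)delivered_pkts * mss * USEC_PER_SEC;

		do_div(rate, interval_us);
		return rate;
	}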
@@ -3261,11 +3278,12 @@ static void __init tcp_init_mem(void)
void __init tcp_init(void)
{
- unsigned long limit;
int max_rshare, max_wshare, cnt;
+ unsigned long limit;
unsigned int i;
- sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 0000000..0ea66c2
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,896 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ * On each ACK, update our model of the network path:
+ * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ * min_rtt = windowed_min(rtt, 10 seconds)
+ * pacing_rate = pacing_gain * bottleneck_bandwidth
+ * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * since pacing is integral to the BBR design and implementation.
+ * BBR without pacing would not function properly, and may incur
+ * unnecessarily high packet loss rates.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
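+/* A worked example of the unit above: one bw unit is 1 pkt / 2^24 usec.
+ * With 1500-byte packets that is 1500 * 8 bits / 16.777216 s ~= 715 bps,
+ * and a u32 tops out at 2^32 / 2^24 = 256 pkts/usec = 256 Mpps (~3 Tbps),
+ * matching the bounds quoted above.
+ */
+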
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
+ u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
+ struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ restore_cwnd:1, /* decided to revert cwnd to old value */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ tso_segs_goal:7, /* segments we want in each skb we send */
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ unused:5,
+ lt_is_sampling:1, /* taking long-term ("LT") samples now? */
+ lt_rtt_cnt:7, /* round trips in long-term interval */
+ lt_use_bw:1; /* use lt_bw as our bw estimate? */
+ u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
+ u32 lt_last_delivered; /* LT intvl start: tp->delivered */
+ u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
+ u32 lt_last_lost; /* LT intvl start: tp->lost */
+ u32 pacing_gain:10, /* current gain for setting pacing rate */
+ cwnd_gain:10, /* current gain for setting cwnd */
+ full_bw_cnt:3, /* number of rounds without large bw gains */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ unused_b:6;
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+};
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+
+/* Window length of bw filter (in rounds): */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static const int bbr_min_tso_rate = 1200000;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would:
+ */
+static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
+static const int bbr_pacing_gain[] = {
+ BBR_UNIT * 5 / 4, /* probe for more available bw */
+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->full_bw_cnt >= bbr_full_bw_cnt;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+ rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+ rate *= gain;
+ rate >>= BBR_SCALE;
+ rate *= USEC_PER_SEC;
+ return rate >> BW_SCALE;
+}
+
+/* Pace using current bw estimate and a gain factor. In order to help drive the
+ * network toward lower queues while maintaining high utilization and low
+ * latency, the average pacing rate aims to be slightly (~1%) lower than the
+ * estimated bandwidth. This is an important aspect of the design. In this
+ * implementation this slightly lower pacing rate is achieved implicitly by not
+ * including link-layer headers in the packet size used for the pacing rate.
+ */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 rate = bw;
+
+ rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+ rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+ if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+ sk->sk_pacing_rate = rate;
+}
+
+/* Return count of segments we want in the skbs we send, or 0 for default. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->tso_segs_goal;
+}
+
+static void bbr_set_tso_segs_goal(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 min_segs;
+
+ min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+ bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
+ 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (event == CA_EVENT_TX_START && tp->app_limited) {
+ bbr->idle_restart = 1;
+ /* Avoid pointless buffer overflows: pace at est. bw if we don't
+ * need more speed (we're restarting from idle and app-limited).
+ */
+ if (bbr->mode == BBR_PROBE_BW)
+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+ }
+}
+
+/* Find target cwnd. Right-size the cwnd based on min RTT and the
+ * estimated bottleneck bandwidth:
+ *
+ * cwnd = bw * min_rtt * gain = BDP * gain
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ *
+ * To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd;
+ u64 w;
+
+ /* If we've never had a valid RTT sample, cap cwnd at the initial
+ * default. This should only happen when the connection is not using TCP
+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+ * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ */
+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
+ return TCP_INIT_CWND; /* be safe: cap at default initial cwnd */
+
+ w = (u64)bw * bbr->min_rtt_us;
+
+ /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd += 3 * bbr->tso_segs_goal;
+
+ /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+ cwnd = (cwnd + 1) & ~1U;
+
+ return cwnd;
+}
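+
+/* A worked example of the formula above: on a 100 Mbit/s, 20 ms path with
+ * 1500-byte packets, bw ~= 0.00833 pkt/usec ~= 139810 in pkts/uS << 24,
+ * so w = 139810 * 20000 and, with gain = 2 * BBR_UNIT, the division
+ * yields cwnd ~= 2 * BDP ~= 334 packets before the tso_segs_goal and
+ * even-rounding adjustments above.
+ */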
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+ struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+ u32 cwnd = tp->snd_cwnd;
+
+ /* An ACK for P pkts should release at most 2*P packets. We do this
+ * in two steps. First, here we deduct the number of lost packets.
+ * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+ */
+ if (rs->losses > 0)
+ cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+ if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+ /* Starting 1st round of Recovery, so do packet conservation. */
+ bbr->packet_conservation = 1;
+ bbr->next_rtt_delivered = tp->delivered; /* start round now */
+ /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+ cwnd = tcp_packets_in_flight(tp) + acked;
+ } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+ /* Exiting loss recovery; restore cwnd saved before recovery. */
+ bbr->restore_cwnd = 1;
+ bbr->packet_conservation = 0;
+ }
+ bbr->prev_ca_state = state;
+
+ if (bbr->restore_cwnd) {
+ /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
+ cwnd = max(cwnd, bbr->prior_cwnd);
+ bbr->restore_cwnd = 0;
+ }
+
+ if (bbr->packet_conservation) {
+ *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+ return true; /* yes, using packet conservation */
+ }
+ *new_cwnd = cwnd;
+ return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+ u32 acked, u32 bw, int gain)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd = 0, target_cwnd = 0;
+
+ if (!acked)
+ return;
+
+ if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+ goto done;
+
+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */
+ target_cwnd = bbr_target_cwnd(sk, bw, gain);
+ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
+ cwnd = min(cwnd + acked, target_cwnd);
+ else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+ cwnd = cwnd + acked;
+ cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
+ tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_full_length =
+ skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
+ bbr->min_rtt_us;
+ u32 inflight, bw;
+
+ /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+ * use the pipe without increasing the queue.
+ */
+ if (bbr->pacing_gain == BBR_UNIT)
+ return is_full_length; /* just use wall clock time */
+
+ inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ bw = bbr_max_bw(sk);
+
+ /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+ * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+ * small (e.g. on a LAN). We do not persist if packets are lost, since
+ * a path with small buffers may not hold that much.
+ */
+ if (bbr->pacing_gain > BBR_UNIT)
+ return is_full_length &&
+ (rs->losses || /* perhaps pacing_gain*BDP won't fit */
+ inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+
+ /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+ * probing didn't find more bw. If inflight falls to match BDP then we
+ * estimate queue is drained; persisting would underutilize the pipe.
+ */
+ return is_full_length ||
+ inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+ bbr->cycle_mstamp = tp->delivered_mstamp;
+ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
+ bbr_is_next_cycle_phase(sk, rs))
+ bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_STARTUP;
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_PROBE_BW;
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+ bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+ if (!bbr_full_bw_reached(sk))
+ bbr_reset_startup_mode(sk);
+ else
+ bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+ bbr->lt_last_delivered = tp->delivered;
+ bbr->lt_last_lost = tp->lost;
+ bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_bw = 0;
+ bbr->lt_use_bw = 0;
+ bbr->lt_is_sampling = false;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 diff;
+
+ if (bbr->lt_bw) { /* do we have bw from a previous interval? */
+ /* Is new bw close to the lt_bw from the previous interval? */
+ diff = abs(bw - bbr->lt_bw);
+ if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
+ (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+ bbr_lt_bw_diff)) {
+ /* All criteria are met; estimate we're policed. */
+ bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */
+ bbr->lt_use_bw = 1;
+ bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */
+ bbr->lt_rtt_cnt = 0;
+ return;
+ }
+ }
+ bbr->lt_bw = bw;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 lost, delivered;
+ u64 bw;
+ s32 t;
+
+ if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
+ if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+ ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
+ bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
+ }
+ return;
+ }
+
+ /* Wait for the first loss before sampling, to let the policer exhaust
+ * its tokens and estimate the steady-state rate allowed by the policer.
+ * Starting samples earlier includes bursts that over-estimate the bw.
+ */
+ if (!bbr->lt_is_sampling) {
+ if (!rs->losses)
+ return;
+ bbr_reset_lt_bw_sampling_interval(sk);
+ bbr->lt_is_sampling = true;
+ }
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (rs->is_app_limited) {
+ bbr_reset_lt_bw_sampling(sk);
+ return;
+ }
+
+ if (bbr->round_start)
+ bbr->lt_rtt_cnt++; /* count round trips in this interval */
+ if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+ return; /* sampling interval needs to be longer */
+ if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* interval is too long */
+ return;
+ }
+
+ /* End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (!rs->losses)
+ return;
+
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = tp->lost - bbr->lt_last_lost;
+ delivered = tp->delivered - bbr->lt_last_delivered;
+ /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+ if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+ return;
+
+ /* Find average delivery rate in this sampling interval. */
+ t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
+ if (t < 1)
+ return; /* interval is less than one jiffy, so wait */
+ t = jiffies_to_usecs(t);
+ /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
+ if (t < 1) {
+ bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
+ return;
+ }
+ bw = (u64)delivered * BW_UNIT;
+ do_div(bw, t);
+ bbr_lt_bw_interval_done(sk, bw);
+}
+
+/* Estimate the bandwidth based on how fast packets are delivered */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->round_start = 0;
+ if (rs->delivered < 0 || rs->interval_us <= 0)
+ return; /* Not a valid observation */
+
+ /* See if we've reached the next RTT */
+ if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->rtt_cnt++;
+ bbr->round_start = 1;
+ bbr->packet_conservation = 0;
+ }
+
+ bbr_lt_bw_sampling(sk, rs);
+
+ /* Divide delivered by the interval to find a (lower bound) bottleneck
+ * bandwidth sample. Delivered is in packets and interval_us in uS and
+ * ratio will be <<1 for most connections. So delivered is first scaled.
+ */
+ bw = (u64)rs->delivered * BW_UNIT;
+ do_div(bw, rs->interval_us);
+
+ /* If this sample is application-limited, it is likely to have a very
+ * low delivered count that represents application behavior rather than
+ * the available network rate. Such a sample could drag down estimated
+ * bw, causing needless slow-down. Thus, to continue to send at the
+ * last measured network rate, we filter out app-limited samples unless
+ * they describe the path bw at least as well as our bw model.
+ *
+ * So the goal during app-limited phase is to proceed with the best
+ * network rate no matter how long. We automatically leave this
+ * phase when app writes faster than the network can deliver :)
+ */
+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
+ /* Incorporate new sample into our max bw filter. */
+ minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+ }
+}
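+
+/* A worked example of the sample above: rs->delivered = 10 packets over
+ * rs->interval_us = 12000 gives bw = 10 * 2^24 / 12000 ~= 13981 units,
+ * i.e. ~0.83e-3 pkt/usec, or ~10 Mbit/s with 1500-byte packets.
+ */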
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw_thresh;
+
+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+ return;
+
+ bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+ if (bbr_max_bw(sk) >= bw_thresh) {
+ bbr->full_bw = bbr_max_bw(sk);
+ bbr->full_bw_cnt = 0;
+ return;
+ }
+ ++bbr->full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_DRAIN; /* drain queue we created */
+ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
+ } /* fall through to check if in-flight is already small: */
+ if (bbr->mode == BBR_DRAIN &&
+ tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool filter_expired;
+
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ filter_expired = after(tcp_time_stamp,
+ bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ bbr->min_rtt_us = rs->rtt_us;
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ }
+
+ if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+ bbr->probe_rtt_done_stamp = tcp_time_stamp +
+ msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ if (bbr->probe_rtt_round_done &&
+ after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ bbr->restore_cwnd = 1; /* snap to prior_cwnd */
+ bbr_reset_mode(sk);
+ }
+ }
+ }
+ bbr->idle_restart = 0;
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+ bbr_update_bw(sk, rs);
+ bbr_update_cycle_phase(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr_check_drain(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw;
+
+ bbr_update_model(sk, rs);
+
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_tso_segs_goal(sk);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+}
+
+static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->prior_cwnd = 0;
+ bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
+ bbr->rtt_cnt = 0;
+ bbr->next_rtt_delivered = 0;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->min_rtt_us = tcp_min_rtt(tp);
+ bbr->min_rtt_stamp = tcp_time_stamp;
+
+ minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
+
+ /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+ bw = (u64)tp->snd_cwnd * BW_UNIT;
+ do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
+ sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */
+ bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+
+ bbr->restore_cwnd = 0;
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp.v64 = 0;
+ bbr->cycle_idx = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ bbr_reset_startup_mode(sk);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+ return 3;
+}
+
+/* In theory BBR does not need to undo the cwnd since it does not
+ * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+ */
+static u32 bbr_undo_cwnd(struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+static u32 bbr_ssthresh(struct sock *sk)
+{
+ bbr_save_cwnd(sk);
+ return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw = bbr_bw(sk);
+
+ bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+ memset(&info->bbr, 0, sizeof(info->bbr));
+ info->bbr.bbr_bw_lo = (u32)bw;
+ info->bbr.bbr_bw_hi = (u32)(bw >> 32);
+ info->bbr.bbr_min_rtt = bbr->min_rtt_us;
+ info->bbr.bbr_pacing_gain = bbr->pacing_gain;
+ info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
+ *attr = INET_DIAG_BBRINFO;
+ return sizeof(info->bbr);
+ }
+ return 0;
+}
+
+static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ struct rate_sample rs = { .losses = 1 };
+
+ bbr->prev_ca_state = TCP_CA_Loss;
+ bbr->full_bw = 0;
+ bbr->round_start = 1; /* treat RTO like end of a round */
+ bbr_lt_bw_sampling(sk, &rs);
+ }
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "bbr",
+ .owner = THIS_MODULE,
+ .init = bbr_init,
+ .cong_control = bbr_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .undo_cwnd = bbr_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr_ssthresh,
+ .tso_segs_goal = bbr_tso_segs_goal,
+ .get_info = bbr_get_info,
+ .set_state = bbr_set_state,
+};
+
+static int __init bbr_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
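
With the module available, a socket can opt into BBR through the standard
TCP_CONGESTION socket option (a user-space sketch; per the note at the top of
the file, the egress device should also run the fq qdisc with pacing):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	/* Hedged usage sketch: select BBR for one socket. Returns 0 on
	 * success, -1 with errno set if "bbr" is not available.
	 */
	static int use_bbr(int fd)
	{
		static const char name[] = "bbr";

		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
				  name, sizeof(name) - 1);
	}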
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 03725b2..35b2803 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
-struct minmax {
+struct cdg_minmax {
union {
struct {
s32 min;
@@ -74,10 +74,10 @@ enum cdg_state {
};
struct cdg {
- struct minmax rtt;
- struct minmax rtt_prev;
- struct minmax *gradients;
- struct minmax gsum;
+ struct cdg_minmax rtt;
+ struct cdg_minmax rtt_prev;
+ struct cdg_minmax *gradients;
+ struct cdg_minmax gsum;
bool gfilled;
u8 tail;
u8 state;
@@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct minmax *gradients;
+ struct cdg_minmax *gradients;
switch (ev) {
case CA_EVENT_CWND_RESTART:
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 882caa4..1294af4 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
int ret = 0;
/* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !ca->cong_avoid) {
+ if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
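
With the relaxed check above, a congestion control module may register
cong_control (full per-ACK control, as tcp_bbr.c does) in place of
cong_avoid; ssthresh remains mandatory. A hypothetical skeleton, where the
demo_* callbacks are placeholders:

	static struct tcp_congestion_ops demo_cc __read_mostly = {
		.name		= "demo",
		.owner		= THIS_MODULE,
		.ssthresh	= demo_ssthresh,	/* still required */
		.cong_control	= demo_cong_control,	/* replaces cong_avoid */
		.undo_cwnd	= demo_undo_cwnd,
	};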
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dad3e7e..a27b9c0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to POLLOUT)
*/
- sndmem = 2 * nr_segs * per_mss;
+ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+ sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
+/* Sum the number of packets on the wire we have marked as lost.
+ * There are two cases we care about here:
+ * a) Packet hasn't been marked lost (nor retransmitted),
+ * and this is the first loss.
+ * b) Packet has been marked both lost and retransmitted,
+ * and this means we think it was lost again.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (!(sacked & TCPCB_LOST) ||
+ ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+ tp->lost += tcp_skb_pcount(skb);
+}
+
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tcp_verify_retransmit_hint(tp, skb);
tp->lost_out += tcp_skb_pcount(skb);
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
+ tcp_sum_lost(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct rate_sample *rate;
int flag;
};
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
+ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+ TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack,
tcp_skb_pcount(skb),
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
- if (found_dup_sack)
+ if (found_dup_sack) {
state->flag |= FLAG_DSACKING_ACK;
+ tp->delivered++; /* A spurious retransmission is delivered */
+ }
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
+ bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+ mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ is_reneg);
+ if (mark_lost)
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ if (mark_lost) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -2329,10 +2362,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- &np->daddr, ntohs(inet->inet_dport),
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2503,6 +2535,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (inet_csk(sk)->icsk_ca_ops->cong_control)
+ return;
+
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2914,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
*rexmit = REXMIT_LOST;
}
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
- const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
- struct rtt_meas *m = tcp_sk(sk)->rtt_min;
- struct rtt_meas rttm = {
- .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
- .ts = now,
- };
- u32 elapsed;
-
- /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
- if (unlikely(rttm.rtt <= m[0].rtt))
- m[0] = m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[1].rtt)
- m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[2].rtt)
- m[2] = rttm;
-
- elapsed = now - m[0].ts;
- if (unlikely(elapsed > wlen)) {
- /* Passed entire window without a new min so make 2nd choice
- * the new min & 3rd choice the new 2nd. So forth and so on.
- */
- m[0] = m[1];
- m[1] = m[2];
- m[2] = rttm;
- if (now - m[0].ts > wlen) {
- m[0] = m[1];
- m[1] = rttm;
- if (now - m[0].ts > wlen)
- m[0] = rttm;
- }
- } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
- /* Passed a quarter of the window without a new min so
- * take 2nd choice from the 2nd quarter of the window.
- */
- m[2] = m[1] = rttm;
- } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
- /* Passed half the window without a new min so take the 3rd
- * choice from the last half of the window.
- */
- m[2] = rttm;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+ minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+ rtt_us ? : jiffies_to_usecs(1));
}
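The replacement body relies on the windowed min filter that this series factors out of the removed open-coded version. A minimal sketch of that API, assuming the lib/win_minmax.c helpers (minmax_reset(), minmax_running_min(), minmax_get()) land together with this patch:

    #include <linux/win_minmax.h>

    /* Track the minimum RTT observed over the last 'win' time units. */
    static u32 track_min_rtt(struct minmax *m, u32 win, u32 now, u32 rtt_us)
    {
        /* Feed one sample; minima older than 'win' age out automatically. */
        minmax_running_min(m, win, now, rtt_us);
        return minmax_get(m);   /* current windowed minimum */
    }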
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3102,10 +3083,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack)
+ struct tcp_sacktag_state *sack,
+ struct skb_mstamp *now)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct skb_mstamp first_ackt, last_ackt, now;
+ struct skb_mstamp first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3137,7 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
-
fully_acked = false;
} else {
/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3154,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
+ tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -3205,16 +3187,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- skb_mstamp_get(&now);
if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
- seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
}
if (sack->first_sackt.v64) {
- sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+ sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
}
-
+ sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
@@ -3242,7 +3223,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3333,8 +3314,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
- int flag)
+ int flag, const struct rate_sample *rs)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->cong_control) {
+ icsk->icsk_ca_ops->cong_control(sk, rs);
+ return;
+ }
+
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3579,17 +3567,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
+ struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- u32 prior_delivered = tp->delivered;
+ u32 delivered = tp->delivered;
+ u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
+ sack_state.rate = &rs;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3604,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ skb_mstamp_get(&now);
+
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -3622,6 +3616,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
}
prior_fackets = tp->fackets_out;
+ rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3677,7 +3672,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ &sack_state, &now);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ lost = tp->lost - lost; /* freshly marked lost */
+ tcp_rate_gen(sk, delivered, lost, &now, &rs);
+ tcp_cong_control(sk, ack, delivered, flag, &rs);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -5951,7 +5949,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* so release it.
*/
if (req) {
- tp->total_retrans = req->num_retrans;
+ inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
@@ -5993,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else
tcp_init_metrics(sk);
- tcp_update_pacing_rate(sk);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
@@ -6326,6 +6325,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
+ inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 04b9893..7ac37c3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1196,7 +1196,6 @@ static void tcp_v4_init_req(struct request_sock *req,
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->no_srccheck = inet_sk(sk_listener)->transparent;
ireq->opt = tcp_v4_save_options(skb);
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f63c73d..6234eba 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- newtp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ newtp->app_limited = ~0U;
+
tcp_init_xmit_timers(newsk);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c59649..bc68da3 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
- /* GSO partial only requires splitting the frame into an MSS
- * multiple and possibly a remainder. So update the mss now.
- */
- if (features & NETIF_F_GSO_PARTIAL)
- mss = skb->len - (skb->len % mss);
-
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
delta = htonl(oldlen + (thlen + mss));
skb = segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8b45794..896e9df 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
- TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->lost_out > tp->retrans_out &&
+ tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tcp_xmit_retransmit_queue(sk);
+
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
+ }
}
/*
* One tasklet per cpu tries to send more skbs.
@@ -918,6 +925,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
+ tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1221,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
+ /* Update delivered info for the new segment */
+ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
@@ -1358,6 +1369,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
}
return mtu;
}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
@@ -1545,7 +1557,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
{
u32 bytes, segs;
@@ -1557,10 +1570,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
- segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+ segs = max_t(u32, bytes / mss_now, min_tso_segs);
return min_t(u32, segs, sk->sk_gso_max_segs);
}
+EXPORT_SYMBOL(tcp_tso_autosize);
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+
+ return tso_segs ? :
+ tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+}
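Note the "?:" fallback above: a congestion module that does not implement tso_segs_goal (or returns 0 from it) falls through to tcp_tso_autosize() with sysctl_tcp_min_tso_segs as the floor, so existing modules keep the previous autosizing behavior unchanged while BBR can substitute its own goal.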
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
@@ -1966,12 +1992,14 @@ static int tcp_mtu_probe(struct sock *sk)
len = 0;
tcp_for_write_queue_from_safe(skb, next, sk) {
copy = min_t(int, skb->len, probe_size - len);
- if (nskb->ip_summed)
+ if (nskb->ip_summed) {
skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
- else
- nskb->csum = skb_copy_and_csum_bits(skb, 0,
- skb_put(nskb, copy),
- copy, nskb->csum);
+ } else {
+ __wsum csum = skb_copy_and_csum_bits(skb, 0,
+ skb_put(nskb, copy),
+ copy, 0);
+ nskb->csum = csum_block_add(nskb->csum, csum, len);
+ }
if (skb->len <= copy) {
/* We've eaten all the data from this skb.
@@ -2020,6 +2048,39 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+/* TCP Small Queues :
+ * Control the number of packets in qdisc/device queues to two packets
+ * or ~1 ms of data. (These limits are doubled for retransmits.)
+ * This allows for:
 (reworded below)
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+ unsigned int factor)
+{
+ unsigned int limit;
+
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+ limit <<= factor;
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ return true;
+ }
+ return false;
+}
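For a sense of scale in the limit computation above: sk_pacing_rate is in bytes per second, so sk_pacing_rate >> 10 (divide by 1024) is roughly one millisecond's worth of data at the pacing rate. At 1 Gbit/s (~125,000,000 B/s) that is about 122 KB before the sysctl_tcp_limit_output_bytes clamp, and the retransmit path passes factor = 1 to double the allowance.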
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2057,7 +2118,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
- max_segs = tcp_tso_autosize(sk, mss_now);
+ max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -2106,29 +2167,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- /* TCP Small Queues :
- * Control number of packets in qdisc/devices to two packets / or ~1 ms.
- * This allows for :
- * - better RTT estimation and ACK scheduling
- * - faster recovery
- * - high rates
- * Alas, some drivers / subsystems require a fair amount
- * of queued bytes to ensure line rate.
- * One example is wifi aggregation (802.11 AMPDU)
- */
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
- limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
-
- if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- /* It is possible TX completion already happened
- * before we set TSQ_THROTTLED, so we must
- * test again the condition.
- */
- smp_mb__after_atomic();
- if (atomic_read(&sk->sk_wmem_alloc) > limit)
- break;
- }
+ if (tcp_small_queue_check(sk, skb, 0))
+ break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2605,7 +2645,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
* copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
- min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
+ sk->sk_sndbuf))
return -EAGAIN;
if (skb_still_in_host_queue(sk, skb))
@@ -2774,7 +2815,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
+ max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked;
int segs;
@@ -2828,10 +2869,13 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
+ if (tcp_small_queue_check(sk, skb, 1))
+ return;
+
if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
@@ -3568,6 +3612,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk)))
+ tcp_sk(sk)->total_retrans++;
}
return res;
}
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 0000000..9be1581
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,186 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Note that the estimator essentially estimates goodput, not necessarily the
+ * network bottleneck link rate, when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
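One step the comment leaves implicit: since the delivered-packet count is the numerator of both ratios, min(send_rate, ack_rate) can be computed with a single division by the longer of the two intervals, which is exactly what tcp_rate_gen() below does. A sketch with names local to this example:

    /* min(send_rate, ack_rate) via one division over the longer phase.
     * Callers must guard against a zero interval, as tcp_rate_gen()
     * does by invalidating such samples.
     */
    static u64 sample_bw_pps(u32 delivered, u32 snd_us, u32 ack_us)
    {
        u32 interval_us = max(snd_us, ack_us);

        return div_u64((u64)delivered * USEC_PER_SEC, interval_us);
    }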
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, resulting in a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ tp->delivered_mstamp = skb->skb_mstamp;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest prior_delivered count.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (!scb->tx.delivered_mstamp.v64)
+ return;
+
+ if (!rs->prior_delivered ||
+ after(scb->tx.delivered, rs->prior_delivered)) {
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = skb_mstamp_us_delta(
+ &skb->skb_mstamp,
+ &scb->tx.first_tx_mstamp);
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ struct skb_mstamp *now, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = *now;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available. */
+ if (!rs->prior_mstamp.v64) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmitted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However, continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections that suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
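The trailing "?: 1" matters: tp->app_limited stores the delivered count at which everything currently in flight will have been (s)acked, and tcp_rate_gen() above clears the mark once tp->delivered passes it. Zero is reserved as the "not app-limited" sentinel, so a connection that is app-limited before anything has been delivered must record at least 1 (the same reason tcp_create_openreq_child() initializes app_limited to ~0U for the pre-first-ACK bubble).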
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d84930b..3ea1cf8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -192,6 +192,8 @@ static int tcp_write_timeout(struct sock *sk)
if (tp->syn_data && icsk->icsk_retransmits == 1)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ } else if (!tp->syn_data && !tp->syn_fastopen) {
+ sk_rethink_txhash(sk);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
@@ -213,6 +215,8 @@ static int tcp_write_timeout(struct sock *sk)
tcp_mtu_probing(icsk, sk);
dst_negative_advice(sk);
+ } else {
+ sk_rethink_txhash(sk);
}
retry_until = net->ipv4.sysctl_tcp_retries2;
@@ -384,6 +388,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
+ icsk->icsk_retransmits++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b..f9333c9 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__be16 new_protocol, bool is_ipv6)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool remcsum, need_csum, offload_csum, ufo;
+ bool remcsum, need_csum, offload_csum, ufo, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
@@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* will be using a length value equal to only one MSS sized
* segment instead of the entire frame.
*/
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
uh->len = htons(skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)uh);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2f1f5d4..cbd9343 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp)
return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
}
+static inline s32 rfc3315_s14_backoff_init(s32 irt)
+{
+ /* multiply 'initial retransmission time' by 0.9 .. 1.1 */
+ u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+ do_div(tmp, 1000000);
+ return (s32)tmp;
+}
+
+static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
+{
+ /* multiply 'retransmission timeout' by 1.9 .. 2.1 */
+ u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+ do_div(tmp, 1000000);
+ if ((s32)tmp > mrt) {
+ /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
+ tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+ do_div(tmp, 1000000);
+ }
+ return (s32)tmp;
+}
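Worked example of the fixed-point math above: (900000 + prandom_u32() % 200001) yields a multiplier uniform in [900000, 1100000], and do_div(..., 1000000) scales it back, i.e. 0.9x to 1.1x. With an initial retransmission time of 4 s, the first solicitation timeout lands in [3.6 s, 4.4 s]; each later timeout is 1.9x to 2.1x the previous one, and once the doubled value would exceed mrt it is instead re-randomized to 0.9x to 1.1x of the configured maximum, per RFC 3315 section 14.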
+
#ifdef CONFIG_SYSCTL
static int addrconf_sysctl_register(struct inet6_dev *idev);
static void addrconf_sysctl_unregister(struct inet6_dev *idev);
@@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -232,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -3687,7 +3710,7 @@ static void addrconf_rs_timer(unsigned long data)
if (idev->if_flags & IF_RA_RCVD)
goto out;
- if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
+ if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) {
write_unlock(&idev->lock);
if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
ndisc_send_rs(dev, &lladdr,
@@ -3696,11 +3719,13 @@ static void addrconf_rs_timer(unsigned long data)
goto put;
write_lock(&idev->lock);
+ idev->rs_interval = rfc3315_s14_backoff_update(
+ idev->rs_interval, idev->cnf.rtr_solicit_max_interval);
/* The wait after the last probe can be shorter */
addrconf_mod_rs_timer(idev, (idev->rs_probes ==
idev->cnf.rtr_solicits) ?
idev->cnf.rtr_solicit_delay :
- idev->cnf.rtr_solicit_interval);
+ idev->rs_interval);
} else {
/*
* Note: we do not support deprecated "all on-link"
@@ -3949,7 +3974,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
send_rs = send_mld &&
ipv6_accept_ra(ifp->idev) &&
- ifp->idev->cnf.rtr_solicits > 0 &&
+ ifp->idev->cnf.rtr_solicits != 0 &&
(dev->flags&IFF_LOOPBACK) == 0;
read_unlock_bh(&ifp->idev->lock);
@@ -3971,10 +3996,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
write_lock_bh(&ifp->idev->lock);
spin_lock(&ifp->lock);
+ ifp->idev->rs_interval = rfc3315_s14_backoff_init(
+ ifp->idev->cnf.rtr_solicit_interval);
ifp->idev->rs_probes = 1;
ifp->idev->if_flags |= IF_RS_SENT;
- addrconf_mod_rs_timer(ifp->idev,
- ifp->idev->cnf.rtr_solicit_interval);
+ addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
spin_unlock(&ifp->lock);
write_unlock_bh(&ifp->idev->lock);
}
@@ -4891,6 +4917,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
array[DEVCONF_RTR_SOLICIT_INTERVAL] =
jiffies_to_msecs(cnf->rtr_solicit_interval);
+ array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
+ jiffies_to_msecs(cnf->rtr_solicit_max_interval);
array[DEVCONF_RTR_SOLICIT_DELAY] =
jiffies_to_msecs(cnf->rtr_solicit_delay);
array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
@@ -4961,18 +4989,18 @@ static inline size_t inet6_if_nlmsg_size(void)
}
static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
- int items, int bytes)
+ int bytes)
{
int i;
- int pad = bytes - sizeof(u64) * items;
+ int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX;
BUG_ON(pad < 0);
/* Use put_unaligned() because stats may not be aligned for u64. */
- put_unaligned(items, &stats[0]);
- for (i = 1; i < items; i++)
+ put_unaligned(ICMP6_MIB_MAX, &stats[0]);
+ for (i = 1; i < ICMP6_MIB_MAX; i++)
put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
- memset(&stats[items], 0, pad);
+ memset(&stats[ICMP6_MIB_MAX], 0, pad);
}
static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
@@ -5005,7 +5033,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
offsetof(struct ipstats_mib, syncp));
break;
case IFLA_INET6_ICMP6STATS:
- __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
+ __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes);
break;
}
}
@@ -5099,7 +5127,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
return -EINVAL;
if (!ipv6_accept_ra(idev))
return -EINVAL;
- if (idev->cnf.rtr_solicits <= 0)
+ if (idev->cnf.rtr_solicits == 0)
return -EINVAL;
write_lock_bh(&idev->lock);
@@ -5128,8 +5156,10 @@ update_lft:
if (update_rs) {
idev->if_flags |= IF_RS_SENT;
+ idev->rs_interval = rfc3315_s14_backoff_init(
+ idev->cnf.rtr_solicit_interval);
idev->rs_probes = 1;
- addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval);
+ addrconf_mod_rs_timer(idev, idev->rs_interval);
}
/* Well, that's kinda nasty ... */
@@ -5467,20 +5497,6 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
}
static
-int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table lctl;
- int min_hl = 1, max_hl = 255;
-
- lctl = *ctl;
- lctl.extra1 = &min_hl;
- lctl.extra2 = &max_hl;
-
- return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
-}
-
-static
int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -5713,6 +5729,9 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
return ret;
}
+static const int one = 1;
+static const int two_five_five = 255;
+
static const struct ctl_table addrconf_sysctl[] = {
{
.procname = "forwarding",
@@ -5726,7 +5745,9 @@ static const struct ctl_table addrconf_sysctl[] = {
.data = &ipv6_devconf.hop_limit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = addrconf_sysctl_hop_limit,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ .extra2 = (void *)&two_five_five,
},
{
.procname = "mtu",
@@ -5778,6 +5799,13 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
+ .procname = "router_solicitation_max_interval",
+ .data = &ipv6_devconf.rtr_solicit_max_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
.procname = "router_solicitation_delay",
.data = &ipv6_devconf.rtr_solicit_delay,
.maxlen = sizeof(int),
@@ -6044,8 +6072,14 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
for (i = 0; table[i].data; i++) {
table[i].data += (char *)p - (char *)&ipv6_devconf;
- table[i].extra1 = idev; /* embedded; no ref */
- table[i].extra2 = net;
+ /* If one of these is already set, then it is not safe to
+ * overwrite either of them: this makes proc_dointvec_minmax
+ * usable.
+ */
+ if (!table[i].extra1 && !table[i].extra2) {
+ table[i].extra1 = idev; /* embedded; no ref */
+ table[i].extra2 = net;
+ }
}
snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 397e1ed..d7d6d3a 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -648,7 +648,6 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
encap_limit = t->parms.encap_limit;
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = skb->protocol;
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
@@ -1239,7 +1238,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
if (data[IFLA_GRE_FLOWINFO])
- parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]);
+ parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]);
if (data[IFLA_GRE_FLAGS])
parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 22e90e5..e7bfd55 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
int offset = 0;
bool encap, udpfrag;
int nhoff;
+ bool gso_partial;
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
@@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (IS_ERR(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
for (skb = segs; skb; skb = skb->next) {
ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
- if (skb_is_gso(skb))
+ if (gso_partial)
payload_len = skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)(ipv6h + 1);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index cc7e058..8a02ca8 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb)
goto discard;
}
- XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
-
rcu_read_unlock();
- return xfrm6_rcv(skb);
+ return xfrm6_rcv_tnl(skb, t);
}
rcu_read_unlock();
return -EINVAL;
@@ -340,6 +338,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
u32 orig_mark = skb->mark;
int ret;
@@ -357,7 +356,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(t->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 6122f9c..7f4265b 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2239,6 +2239,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
+ unsigned long lastuse;
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
@@ -2269,12 +2270,14 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
- nla_put_u64_64bit(skb, RTA_EXPIRES,
- jiffies_to_clock_t(c->mfc_un.res.lastuse),
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
return -EMSGSIZE;
@@ -2282,8 +2285,8 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
return 1;
}
-int ip6mr_get_route(struct net *net,
- struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
+ int nowait, u32 portid)
{
int err;
struct mr6_table *mrt;
@@ -2328,6 +2331,7 @@ int ip6mr_get_route(struct net *net,
return -ENOMEM;
}
+ NETLINK_CB(skb2).portid = portid;
skb_reset_transport_header(skb2);
skb_put(skb2, sizeof(struct ipv6hdr));
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 552fac2..55aacea 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_WARNING,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 1aa5848..963ee38 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -115,7 +115,7 @@ static unsigned int ipv6_helper(void *priv,
help = nfct_help(ct);
if (!help)
return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 660bc10..f5a61bc 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -165,7 +165,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
return -NF_ACCEPT;
}
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index c1bcf69..57d8606 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
else
- logflags = NF_LOG_MASK;
+ logflags = NF_LOG_DEFAULT_MASK;
ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
if (ih == NULL) {
@@ -84,7 +84,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
}
/* Max length: 48 "OPT (...) " */
- if (logflags & XT_LOG_IPOPT)
+ if (logflags & NF_LOG_IPOPT)
nf_log_buf_add(m, "OPT ( ");
switch (currenthdr) {
@@ -121,7 +121,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
if (fragment) {
- if (logflags & XT_LOG_IPOPT)
+ if (logflags & NF_LOG_IPOPT)
nf_log_buf_add(m, ")");
return;
}
@@ -129,7 +129,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
break;
/* Max Length */
case IPPROTO_AH:
- if (logflags & XT_LOG_IPOPT) {
+ if (logflags & NF_LOG_IPOPT) {
struct ip_auth_hdr _ahdr;
const struct ip_auth_hdr *ah;
@@ -161,7 +161,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
hdrlen = (hp->hdrlen+2)<<2;
break;
case IPPROTO_ESP:
- if (logflags & XT_LOG_IPOPT) {
+ if (logflags & NF_LOG_IPOPT) {
struct ip_esp_hdr _esph;
const struct ip_esp_hdr *eh;
@@ -194,7 +194,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
return;
}
- if (logflags & XT_LOG_IPOPT)
+ if (logflags & NF_LOG_IPOPT)
nf_log_buf_add(m, ") ");
currenthdr = hp->nexthdr;
@@ -277,7 +277,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
}
/* Max length: 15 "UID=4294967295 " */
- if ((logflags & XT_LOG_UID) && recurse)
+ if ((logflags & NF_LOG_UID) && recurse)
nf_log_dump_sk_uid_gid(m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
@@ -295,7 +295,7 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
if (info->type == NF_LOG_TYPE_LOG)
logflags = info->u.log.logflags;
- if (!(logflags & XT_LOG_MACDECODE))
+ if (!(logflags & NF_LOG_MACDECODE))
goto fallback;
switch (dev->type) {
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 30b22f4..d6e4ba5 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
{
struct nft_pktinfo pkt;
- /* malformed packet, drop it */
- if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
- return NF_DROP;
+ nft_set_pktinfo_ipv6(&pkt, skb, state);
return nft_do_chain(&pkt, priv);
}
@@ -102,7 +100,10 @@ static int __init nf_tables_ipv6_init(void)
{
int ret;
- nft_register_chain_type(&filter_ipv6);
+ ret = nft_register_chain_type(&filter_ipv6);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_ipv6);
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 71d995f..f272747 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -31,10 +31,9 @@ static unsigned int nf_route_table_hook(void *priv,
struct in6_addr saddr, daddr;
u_int8_t hop_limit;
u32 mark, flowlabel;
+ int err;
- /* malformed packet, drop it */
- if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
- return NF_DROP;
+ nft_set_pktinfo_ipv6(&pkt, skb, state);
/* save source/dest address, mark, hoplimit, flowlabel, priority */
memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
@@ -46,13 +45,16 @@ static unsigned int nf_route_table_hook(void *priv,
flowlabel = *((u32 *)ipv6_hdr(skb));
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE &&
+ if (ret != NF_DROP && ret != NF_STOLEN &&
(memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
- flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
- return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP;
+ flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+ err = ip6_route_me_harder(state->net, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
return ret;
}
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 679253d0..cc8e3ae 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -30,6 +30,11 @@
#include <net/transp_v6.h>
#include <net/ipv6.h>
+#define MAX4(a, b, c, d) \
+ max_t(u32, max_t(u32, a, b), max_t(u32, c, d))
+#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
+ IPSTATS_MIB_MAX, ICMP_MIB_MAX)
+
static int sockstat6_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq->private;
@@ -191,25 +196,34 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
atomic_long_t *smib,
const struct snmp_mib *itemlist)
{
+ unsigned long buff[SNMP_MIB_MAX];
int i;
- unsigned long val;
- for (i = 0; itemlist[i].name; i++) {
- val = pcpumib ?
- snmp_fold_field(pcpumib, itemlist[i].entry) :
- atomic_long_read(smib + itemlist[i].entry);
- seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val);
+ if (pcpumib) {
+ memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+
+ snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
+ for (i = 0; itemlist[i].name; i++)
+ seq_printf(seq, "%-32s\t%lu\n",
+ itemlist[i].name, buff[i]);
+ } else {
+ for (i = 0; itemlist[i].name; i++)
+ seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
+ atomic_long_read(smib + itemlist[i].entry));
}
}
static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
const struct snmp_mib *itemlist, size_t syncpoff)
{
+ u64 buff64[SNMP_MIB_MAX];
int i;
+ memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
+
+ snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
for (i = 0; itemlist[i].name; i++)
- seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name,
- snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
+ seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
}
static int snmp6_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff..bdbc38e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
- struct net_device *dev,
- struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+ struct net_device *dev,
+ struct flowi6 *fl6, int flags)
{
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
void ip6_route_input(struct sk_buff *skb)
{
@@ -1991,9 +1992,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
- if (cfg->fc_table)
+ if (cfg->fc_table) {
grt = ip6_nh_lookup_table(net, cfg, gw_addr);
+ if (grt) {
+ if (grt->rt6i_flags & RTF_GATEWAY ||
+ (dev && dev != grt->dst.dev)) {
+ ip6_rt_put(grt);
+ grt = NULL;
+ }
+ }
+ }
+
if (!grt)
grt = rt6_lookup(net, gw_addr, NULL,
cfg->fc_ifindex, 1);
@@ -3206,7 +3216,9 @@ static int rt6_fill_node(struct net *net,
if (iif) {
#ifdef CONFIG_IPV6_MROUTE
if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
- int err = ip6mr_get_route(net, skb, rtm, nowait);
+ int err = ip6mr_get_route(net, skb, rtm, nowait,
+ portid);
+
if (err <= 0) {
if (!nowait) {
if (err == 0)
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00a2d40..b578956 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm6_extract_header(skb);
}
-int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
+ struct ip6_tnl *t)
{
- XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
return xfrm_input(skb, nexthdr, spi, 0);
@@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
return -1;
}
-int xfrm6_rcv(struct sk_buff *skb)
+int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t)
{
return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
- 0);
+ 0, t);
}
-EXPORT_SYMBOL(xfrm6_rcv);
+EXPORT_SYMBOL(xfrm6_rcv_tnl);
+int xfrm6_rcv(struct sk_buff *skb)
+{
+ return xfrm6_rcv_tnl(skb, NULL);
+}
+EXPORT_SYMBOL(xfrm6_rcv);
int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
xfrm_address_t *saddr, u8 proto)
{
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 5743044..e1c0bbe 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb)
__be32 spi;
spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr);
- return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi);
+ return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL);
}
static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index db63969..391c3cb 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
struct sock *sk = sock->sk;
struct irda_sock *new, *self = irda_sk(sk);
struct sock *newsk;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
int err;
err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
@@ -897,7 +897,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
err = -EPERM; /* value does not seem to make sense. -arnd */
if (!new->tsap) {
pr_debug("%s(), dup failed!\n", __func__);
- kfree_skb(skb);
goto out;
}
@@ -916,7 +915,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
/* Clean up the original one to keep it in listen state */
irttp_listen(self->tsap);
- kfree_skb(skb);
sk->sk_ack_backlog--;
newsock->state = SS_CONNECTED;
@@ -924,6 +922,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
irda_connect_response(new);
err = 0;
out:
+ kfree_skb(skb);
release_sock(sk);
return err;
}
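The irda_accept() change replaces per-path kfree_skb() calls with a single free at the shared out: label, relying on skb being initialized to NULL and on kfree_skb(NULL) being a no-op. The same pattern in isolation, with an illustrative function that always consumes the skb:
static int example_consume(struct sock *sk)
{
	struct sk_buff *skb = NULL;
	int err = -EAGAIN;

	skb = skb_dequeue(&sk->sk_receive_queue);
	if (!skb)
		goto out;	/* nothing dequeued; free is a no-op */

	/* ... use skb; any failure path can simply goto out ... */
	err = 0;
out:
	kfree_skb(skb);		/* one cleanup site for every path */
	return err;
}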
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index a5d69df..f6749dc 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -261,10 +261,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
.timeout = timeout,
.ssn = start_seq_num,
};
-
int i, ret = -EOPNOTSUPP;
u16 status = WLAN_STATUS_REQUEST_DECLINED;
+ if (tid >= IEEE80211_FIRST_TSPEC_TSID) {
+ ht_dbg(sta->sdata,
+ "STA %pM requests BA session on unsupported tid %d\n",
+ sta->sta.addr, tid);
+ goto end_no_lock;
+ }
+
if (!sta->sta.ht_cap.ht_supported) {
ht_dbg(sta->sdata,
"STA %pM erroneously requests BA session on tid %d w/o QoS\n",
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 5650c46..45319cc 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -584,6 +584,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW))
return -EINVAL;
+ if (WARN_ON(tid >= IEEE80211_FIRST_TSPEC_TSID))
+ return -EINVAL;
+
ht_dbg(sdata, "Open BA session requested for %pM tid %u\n",
pubsta->addr, tid);
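Both aggregation paths now refuse TSPEC TSIDs: TIDs 0-7 map to QoS traffic classes, while 8-15 are TSIDs that mac80211 cannot service with a BA session. The guard reduces to a range check; the helper name here is illustrative only:
static bool ampdu_tid_ok(u16 tid)
{
	/* only TIDs 0..7, below IEEE80211_FIRST_TSPEC_TSID, qualify */
	return tid < IEEE80211_FIRST_TSPEC_TSID;
}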
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 8f8bddd..34c2add 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1232,7 +1232,7 @@ struct ieee80211_local {
spinlock_t tim_lock;
unsigned long num_sta;
struct list_head sta_list;
- struct rhashtable sta_hash;
+ struct rhltable sta_hash;
struct timer_list sta_cleanup;
int sta_generation;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index fa7d37c..b747c96 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -757,6 +757,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
sta = next_hop_deref_protected(mpath);
if (mpath->flags & MESH_PATH_ACTIVE &&
ether_addr_equal(ta, sta->sta.addr) &&
+ !(mpath->flags & MESH_PATH_FIXED) &&
(!(mpath->flags & MESH_PATH_SN_VALID) ||
SN_GT(target_sn, mpath->sn) || target_sn == 0)) {
mpath->flags &= ~MESH_PATH_ACTIVE;
@@ -1023,7 +1024,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
goto enddiscovery;
spin_lock_bh(&mpath->state_lock);
- if (mpath->flags & MESH_PATH_DELETED) {
+ if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) {
spin_unlock_bh(&mpath->state_lock);
goto enddiscovery;
}
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 6db2ddf..f0e6175 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
mpath->metric = 0;
mpath->hop_count = 0;
mpath->exp_time = 0;
- mpath->flags |= MESH_PATH_FIXED;
+ mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID;
mesh_path_activate(mpath);
spin_unlock_bh(&mpath->state_lock);
mesh_path_tx_pending(mpath);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b2fe725..6175db3 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4004,7 +4004,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
- struct rhash_head *tmp;
+ struct rhlist_head *tmp;
int err = 0;
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -4047,13 +4047,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
goto out;
} else if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
- const struct bucket_table *tbl;
prev_sta = NULL;
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+ for_each_sta_info(local, hdr->addr2, sta, tmp) {
if (!prev_sta) {
prev_sta = sta;
continue;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 167bff0..78e9ecb 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
- .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
- .hashfn = sta_addr_hash,
.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
static int sta_info_hash_del(struct ieee80211_local *local,
struct sta_info *sta)
{
- return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhltable_remove(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
sta_info_free(local, sta);
}
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+ const u8 *addr)
+{
+ return rhltable_lookup(&local->sta_hash, addr, sta_rht_params);
+}
+
/* protected by RCU */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
rcu_read_lock();
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata) {
rcu_read_unlock();
/* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
rcu_read_lock();
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata ||
(sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
rcu_read_unlock();
@@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
static int sta_info_hash_add(struct ieee80211_local *local,
struct sta_info *sta)
{
- return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhltable_insert(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void sta_deliver_ps_frames(struct work_struct *wk)
@@ -453,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta)
is_multicast_ether_addr(sta->sta.addr)))
return -EINVAL;
- /* Strictly speaking this isn't necessary as we hold the mutex, but
- * the rhashtable code can't really deal with that distinction. We
- * do require the mutex for correctness though.
+ /* The RCU read lock is required by rhashtable due to
+ * asynchronous resize/rehash. We also require the mutex
+ * for correctness.
*/
rcu_read_lock();
lockdep_assert_held(&sdata->local->sta_mtx);
@@ -1043,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data)
round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
}
-u32 sta_addr_hash(const void *key, u32 length, u32 seed)
-{
- return jhash(key, ETH_ALEN, seed);
-}
-
int sta_info_init(struct ieee80211_local *local)
{
int err;
- err = rhashtable_init(&local->sta_hash, &sta_rht_params);
+ err = rhltable_init(&local->sta_hash, &sta_rht_params);
if (err)
return err;
@@ -1068,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local)
void sta_info_stop(struct ieee80211_local *local)
{
del_timer_sync(&local->sta_cleanup);
- rhashtable_destroy(&local->sta_hash);
+ rhltable_destroy(&local->sta_hash);
}
@@ -1138,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
const u8 *localaddr)
{
struct ieee80211_local *local = hw_to_local(hw);
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
-
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
/*
* Just return a random station if localaddr is NULL
* ... first in list.
*/
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (localaddr &&
!ether_addr_equal(sta->sdata->vif.addr, localaddr))
continue;
@@ -1617,7 +1607,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
sta_info_recalc_tim(sta);
} else {
- unsigned long tids = sta->txq_buffered_tids & driver_release_tids;
int tid;
/*
@@ -1647,7 +1636,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
return;
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
- if (!(tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid]))
+ if (!(driver_release_tids & BIT(tid)) ||
+ txq_has_queue(sta->sta.txq[tid]))
continue;
sta_info_recalc_tim(sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 530231b..ed5fcb9 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -455,7 +455,7 @@ struct sta_info {
/* General information, mostly static */
struct list_head list, free_list;
struct rcu_head rcu_head;
- struct rhash_head hash_node;
+ struct rhlist_head hash_node;
u8 addr[ETH_ALEN];
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
@@ -638,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
*/
#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+ const u8 *addr);
+
/*
* Get a STA info, must be under RCU read lock.
*/
@@ -647,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr);
-u32 sta_addr_hash(const void *key, u32 length, u32 seed);
-
-#define _sta_bucket_idx(_tbl, _a) \
- rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd))
-
-#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \
- rht_for_each_entry_rcu(_sta, _tmp, tbl, \
- _sta_bucket_idx(tbl, _addr), \
- hash_node) \
- /* compare address and run code only if it matches */ \
- if (ether_addr_equal(_sta->addr, (_addr)))
+#define for_each_sta_info(local, _addr, _sta, _tmp) \
+ rhl_for_each_entry_rcu(_sta, _tmp, \
+ sta_info_hash_lookup(local, _addr), hash_node)
/*
* Get STA info by index, BROKEN!
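With the table converted from rhashtable to rhltable, several stations may legitimately share one MAC-address key, and for_each_sta_info() now walks the per-key chain returned by rhltable_lookup(). Roughly what a caller looks like after macro expansion, assuming it runs under rcu_read_lock() and that sta_rht_params is visible to the caller:
static struct sta_info *lookup_for_sdata(struct ieee80211_local *local,
					 struct ieee80211_sub_if_data *sdata,
					 const u8 *addr)
{
	struct rhlist_head *list, *tmp;
	struct sta_info *sta;

	list = rhltable_lookup(&local->sta_hash, addr, sta_rht_params);
	rhl_for_each_entry_rcu(sta, tmp, list, hash_node) {
		if (sta->sdata == sdata)
			return sta;	/* first match on this interface */
	}
	return NULL;
}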
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index ea39f8a7..ddf71c6 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -746,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
__le16 fc;
struct ieee80211_supported_band *sband;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
int retry_count;
int rates_idx;
bool send_to_cooked;
@@ -755,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
- const struct bucket_table *tbl;
rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
@@ -764,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
sband = local->hw.wiphy->bands[info->band];
fc = hdr->frame_control;
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) {
+ for_each_sta_info(local, hdr->addr1, sta, tmp) {
/* skip wrong virtual interface */
if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
continue;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 0ea1b0d..1c56abc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3447,13 +3447,17 @@ begin:
skb_queue_splice_tail(&tx.skbs, &txqi->frags);
}
+ if (skb && skb_has_frag_list(skb) &&
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
+ if (skb_linearize(skb)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ goto begin;
+ }
+ }
+
out:
spin_unlock_bh(&fq->lock);
- if (skb && skb_has_frag_list(skb) &&
- !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
- skb_linearize(skb);
-
return skb;
}
EXPORT_SYMBOL(ieee80211_tx_dequeue);
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 7079cd3..06019db 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
/* TODO check this */
SET_NETDEV_DEV(ndev, &local->phy->dev);
+ dev_net_set(ndev, wpan_phy_net(local->hw.phy));
sdata = netdev_priv(ndev);
ndev->ieee802154_ptr = &sdata->wpan_dev;
memcpy(sdata->name, ndev->name, IFNAMSIZ);
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index 446e130..4dcf6e1 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -101,11 +101,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
sdata->dev->stats.rx_bytes += skb->len;
switch (mac_cb(skb)->type) {
+ case IEEE802154_FC_TYPE_BEACON:
+ case IEEE802154_FC_TYPE_ACK:
+ case IEEE802154_FC_TYPE_MAC_CMD:
+ goto fail;
+
case IEEE802154_FC_TYPE_DATA:
return ieee802154_deliver_skb(skb);
default:
- pr_warn("ieee802154: bad frame received (type = %d)\n",
- mac_cb(skb)->type);
+ pr_warn_ratelimited("ieee802154: bad frame received (type = %d)\n",
+ mac_cb(skb)->type);
goto fail;
}
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 732a5c1..bdfef6c 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -1,9 +1,6 @@
#ifndef MPLS_INTERNAL_H
#define MPLS_INTERNAL_H
-
-struct mpls_shim_hdr {
- __be32 label_stack_entry;
-};
+#include <net/mpls.h>
struct mpls_entry_decoded {
u32 label;
@@ -93,11 +90,6 @@ struct mpls_route { /* next hop label forwarding entry */
#define endfor_nexthops(rt) }
-static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
-{
- return (struct mpls_shim_hdr *)skb_network_header(skb);
-}
-
static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos)
{
struct mpls_shim_hdr result;
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 33738c0..13290a7 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -170,6 +170,7 @@ struct ncsi_package;
#define NCSI_PACKAGE_SHIFT 5
#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7)
+#define NCSI_RESERVED_CHANNEL 0x1f
#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c))
@@ -186,9 +187,15 @@ struct ncsi_channel {
struct ncsi_channel_mode modes[NCSI_MODE_MAX];
struct ncsi_channel_filter *filters[NCSI_FILTER_MAX];
struct ncsi_channel_stats stats;
- struct timer_list timer; /* Link monitor timer */
- bool enabled; /* Timer is enabled */
- unsigned int timeout; /* Times of timeout */
+ struct {
+ struct timer_list timer;
+ bool enabled;
+ unsigned int state;
+#define NCSI_CHANNEL_MONITOR_START 0
+#define NCSI_CHANNEL_MONITOR_RETRY 1
+#define NCSI_CHANNEL_MONITOR_WAIT 2
+#define NCSI_CHANNEL_MONITOR_WAIT_MAX 5
+ } monitor;
struct list_head node;
struct list_head link;
};
@@ -206,7 +213,8 @@ struct ncsi_package {
struct ncsi_request {
unsigned char id; /* Request ID - 0 to 255 */
bool used; /* Request that has been assigned */
- bool driven; /* Drive state machine */
+ unsigned int flags; /* NCSI request property */
+#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
struct ncsi_dev_priv *ndp; /* Associated NCSI device */
struct sk_buff *cmd; /* Associated NCSI command packet */
struct sk_buff *rsp; /* Associated NCSI response packet */
@@ -258,6 +266,7 @@ struct ncsi_dev_priv {
struct list_head packages; /* List of packages */
struct ncsi_request requests[256]; /* Request table */
unsigned int request_id; /* Last used request ID */
+#define NCSI_REQ_START_IDX 1
unsigned int pending_req_num; /* Number of pending requests */
struct ncsi_package *active_package; /* Currently handled package */
struct ncsi_channel *active_channel; /* Currently handled channel */
@@ -274,7 +283,7 @@ struct ncsi_cmd_arg {
unsigned char package; /* Destination package ID */
unsigned char channel; /* Destination channel ID or 0x1f */
unsigned short payload; /* Command packet payload length */
- bool driven; /* Drive the state machine? */
+ unsigned int req_flags; /* NCSI request properties */
union {
unsigned char bytes[16]; /* Command packet specific data */
unsigned short words[8];
@@ -313,7 +322,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
unsigned char id,
struct ncsi_package **np,
struct ncsi_channel **nc);
-struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven);
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
+ unsigned int req_flags);
void ncsi_free_request(struct ncsi_request *nr);
struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
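Two related conversions happen in this header: the flat timer fields collapse into a monitor sub-struct with a small state counter, and the request's bool driven becomes a flags bitmask so further properties can be added without reshaping the struct again. The bitmask idiom in miniature; the names below are stand-ins, not the kernel's:
struct req_like {
	unsigned int flags;
#define REQ_FLAG_EVENT_DRIVEN	1	/* replaces: bool driven */
};

static bool req_is_event_driven(const struct req_like *r)
{
	return !!(r->flags & REQ_FLAG_EVENT_DRIVEN);
}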
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index d463468..b41a661 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -53,7 +53,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
struct ncsi_aen_lsc_pkt *lsc;
struct ncsi_channel *nc;
struct ncsi_channel_mode *ncm;
- unsigned long old_data;
+ bool chained;
+ int state;
+ unsigned long old_data, data;
unsigned long flags;
/* Find the NCSI channel */
@@ -62,20 +64,27 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
return -ENODEV;
/* Update the link status */
- ncm = &nc->modes[NCSI_MODE_LINK];
lsc = (struct ncsi_aen_lsc_pkt *)h;
+
+ spin_lock_irqsave(&nc->lock, flags);
+ ncm = &nc->modes[NCSI_MODE_LINK];
old_data = ncm->data[2];
- ncm->data[2] = ntohl(lsc->status);
+ data = ntohl(lsc->status);
+ ncm->data[2] = data;
ncm->data[4] = ntohl(lsc->oem_status);
- if (!((old_data ^ ncm->data[2]) & 0x1) ||
- !list_empty(&nc->link))
+
+ chained = !list_empty(&nc->link);
+ state = nc->state;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ if (!((old_data ^ data) & 0x1) || chained)
return 0;
- if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) &&
- !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1)))
+ if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) &&
+ !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
return 0;
if (!(ndp->flags & NCSI_DEV_HWA) &&
- nc->state == NCSI_CHANNEL_ACTIVE)
+ state == NCSI_CHANNEL_ACTIVE)
ndp->flags |= NCSI_DEV_RESHUFFLE;
ncsi_stop_channel_monitor(nc);
@@ -97,13 +106,21 @@ static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp,
if (!nc)
return -ENODEV;
+ spin_lock_irqsave(&nc->lock, flags);
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_ACTIVE)
+ nc->state != NCSI_CHANNEL_ACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
return 0;
+ }
+ spin_unlock_irqrestore(&nc->lock, flags);
ncsi_stop_channel_monitor(nc);
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
spin_lock_irqsave(&ndp->lock, flags);
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ nc->state = NCSI_CHANNEL_INACTIVE;
list_add_tail_rcu(&nc->link, &ndp->channel_queue);
spin_unlock_irqrestore(&ndp->lock, flags);
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 21057a8..db7083b 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -272,7 +272,7 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca)
struct sk_buff *skb;
struct ncsi_request *nr;
- nr = ncsi_alloc_request(ndp, nca->driven);
+ nr = ncsi_alloc_request(ndp, nca->req_flags);
if (!nr)
return NULL;
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index ef017b8..5e509e5 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -132,6 +132,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
struct ncsi_dev *nd = &ndp->ndev;
struct ncsi_package *np;
struct ncsi_channel *nc;
+ unsigned long flags;
nd->state = ncsi_dev_state_functional;
if (force_down) {
@@ -142,14 +143,21 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
nd->link_up = 0;
NCSI_FOR_EACH_PACKAGE(ndp, np) {
NCSI_FOR_EACH_CHANNEL(np, nc) {
+ spin_lock_irqsave(&nc->lock, flags);
+
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_ACTIVE)
+ nc->state != NCSI_CHANNEL_ACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
continue;
+ }
if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+ spin_unlock_irqrestore(&nc->lock, flags);
nd->link_up = 1;
goto report;
}
+
+ spin_unlock_irqrestore(&nc->lock, flags);
}
}
@@ -163,43 +171,55 @@ static void ncsi_channel_monitor(unsigned long data)
struct ncsi_package *np = nc->package;
struct ncsi_dev_priv *ndp = np->ndp;
struct ncsi_cmd_arg nca;
- bool enabled;
- unsigned int timeout;
+ bool enabled, chained;
+ unsigned int monitor_state;
unsigned long flags;
- int ret;
+ int state, ret;
spin_lock_irqsave(&nc->lock, flags);
- timeout = nc->timeout;
- enabled = nc->enabled;
+ state = nc->state;
+ chained = !list_empty(&nc->link);
+ enabled = nc->monitor.enabled;
+ monitor_state = nc->monitor.state;
spin_unlock_irqrestore(&nc->lock, flags);
- if (!enabled || !list_empty(&nc->link))
+ if (!enabled || chained)
return;
- if (nc->state != NCSI_CHANNEL_INACTIVE &&
- nc->state != NCSI_CHANNEL_ACTIVE)
+ if (state != NCSI_CHANNEL_INACTIVE &&
+ state != NCSI_CHANNEL_ACTIVE)
return;
- if (!(timeout % 2)) {
+ switch (monitor_state) {
+ case NCSI_CHANNEL_MONITOR_START:
+ case NCSI_CHANNEL_MONITOR_RETRY:
nca.ndp = ndp;
nca.package = np->id;
nca.channel = nc->id;
nca.type = NCSI_PKT_CMD_GLS;
- nca.driven = false;
+ nca.req_flags = 0;
ret = ncsi_xmit_cmd(&nca);
if (ret) {
netdev_err(ndp->ndev.dev, "Error %d sending GLS\n",
ret);
return;
}
- }
- if (timeout + 1 >= 3) {
+ break;
+ case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX:
+ break;
+ default:
if (!(ndp->flags & NCSI_DEV_HWA) &&
- nc->state == NCSI_CHANNEL_ACTIVE)
+ state == NCSI_CHANNEL_ACTIVE) {
ncsi_report_link(ndp, true);
+ ndp->flags |= NCSI_DEV_RESHUFFLE;
+ }
+
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
spin_lock_irqsave(&ndp->lock, flags);
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ nc->state = NCSI_CHANNEL_INACTIVE;
list_add_tail_rcu(&nc->link, &ndp->channel_queue);
spin_unlock_irqrestore(&ndp->lock, flags);
ncsi_process_next_channel(ndp);
@@ -207,10 +227,9 @@ static void ncsi_channel_monitor(unsigned long data)
}
spin_lock_irqsave(&nc->lock, flags);
- nc->timeout = timeout + 1;
- nc->enabled = true;
+ nc->monitor.state++;
spin_unlock_irqrestore(&nc->lock, flags);
- mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+ mod_timer(&nc->monitor.timer, jiffies + HZ);
}
void ncsi_start_channel_monitor(struct ncsi_channel *nc)
@@ -218,12 +237,12 @@ void ncsi_start_channel_monitor(struct ncsi_channel *nc)
unsigned long flags;
spin_lock_irqsave(&nc->lock, flags);
- WARN_ON_ONCE(nc->enabled);
- nc->timeout = 0;
- nc->enabled = true;
+ WARN_ON_ONCE(nc->monitor.enabled);
+ nc->monitor.enabled = true;
+ nc->monitor.state = NCSI_CHANNEL_MONITOR_START;
spin_unlock_irqrestore(&nc->lock, flags);
- mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2)));
+ mod_timer(&nc->monitor.timer, jiffies + HZ);
}
void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
@@ -231,14 +250,14 @@ void ncsi_stop_channel_monitor(struct ncsi_channel *nc)
unsigned long flags;
spin_lock_irqsave(&nc->lock, flags);
- if (!nc->enabled) {
+ if (!nc->monitor.enabled) {
spin_unlock_irqrestore(&nc->lock, flags);
return;
}
- nc->enabled = false;
+ nc->monitor.enabled = false;
spin_unlock_irqrestore(&nc->lock, flags);
- del_timer_sync(&nc->timer);
+ del_timer_sync(&nc->monitor.timer);
}
struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
@@ -267,8 +286,9 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
nc->id = id;
nc->package = np;
nc->state = NCSI_CHANNEL_INACTIVE;
- nc->enabled = false;
- setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc);
+ nc->monitor.enabled = false;
+ setup_timer(&nc->monitor.timer,
+ ncsi_channel_monitor, (unsigned long)nc);
spin_lock_init(&nc->lock);
INIT_LIST_HEAD(&nc->link);
for (index = 0; index < NCSI_CAP_MAX; index++)
@@ -405,7 +425,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp,
* be the same; otherwise a bogus response might be matched to it. So
* the available IDs are allocated in round-robin fashion.
*/
-struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
+struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
+ unsigned int req_flags)
{
struct ncsi_request *nr = NULL;
int i, limit = ARRAY_SIZE(ndp->requests);
@@ -413,30 +434,31 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven)
/* Look for an available request from the cursor up to the ceiling */
spin_lock_irqsave(&ndp->lock, flags);
- for (i = ndp->request_id; !nr && i < limit; i++) {
+ for (i = ndp->request_id; i < limit; i++) {
if (ndp->requests[i].used)
continue;
nr = &ndp->requests[i];
nr->used = true;
- nr->driven = driven;
- if (++ndp->request_id >= limit)
- ndp->request_id = 0;
+ nr->flags = req_flags;
+ ndp->request_id = i + 1;
+ goto found;
}
/* Fall back and search again from the starting cursor */
- for (i = 0; !nr && i < ndp->request_id; i++) {
+ for (i = NCSI_REQ_START_IDX; i < ndp->request_id; i++) {
if (ndp->requests[i].used)
continue;
nr = &ndp->requests[i];
nr->used = true;
- nr->driven = driven;
- if (++ndp->request_id >= limit)
- ndp->request_id = 0;
+ nr->flags = req_flags;
+ ndp->request_id = i + 1;
+ goto found;
}
- spin_unlock_irqrestore(&ndp->lock, flags);
+found:
+ spin_unlock_irqrestore(&ndp->lock, flags);
return nr;
}
@@ -458,7 +480,7 @@ void ncsi_free_request(struct ncsi_request *nr)
nr->cmd = NULL;
nr->rsp = NULL;
nr->used = false;
- driven = nr->driven;
+ driven = !!(nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN);
spin_unlock_irqrestore(&ndp->lock, flags);
if (driven && cmd && --ndp->pending_req_num == 0)
@@ -508,10 +530,11 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
struct ncsi_package *np = ndp->active_package;
struct ncsi_channel *nc = ndp->active_channel;
struct ncsi_cmd_arg nca;
+ unsigned long flags;
int ret;
nca.ndp = ndp;
- nca.driven = true;
+ nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
case ncsi_dev_state_suspend:
nd->state = ncsi_dev_state_suspend_select;
@@ -527,7 +550,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
nca.package = np->id;
if (nd->state == ncsi_dev_state_suspend_select) {
nca.type = NCSI_PKT_CMD_SP;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
if (ndp->flags & NCSI_DEV_HWA)
nca.bytes[0] = 0;
else
@@ -544,7 +567,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
nd->state = ncsi_dev_state_suspend_deselect;
} else if (nd->state == ncsi_dev_state_suspend_deselect) {
nca.type = NCSI_PKT_CMD_DP;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
nd->state = ncsi_dev_state_suspend_done;
}
@@ -556,7 +579,9 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
break;
case ncsi_dev_state_suspend_done:
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ spin_lock_irqsave(&nc->lock, flags);
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
ncsi_process_next_channel(ndp);
break;
@@ -574,10 +599,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
struct ncsi_channel *nc = ndp->active_channel;
struct ncsi_cmd_arg nca;
unsigned char index;
+ unsigned long flags;
int ret;
nca.ndp = ndp;
- nca.driven = true;
+ nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
case ncsi_dev_state_config:
case ncsi_dev_state_config_sp:
@@ -590,7 +616,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
else
nca.bytes[0] = 1;
nca.package = np->id;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
@@ -675,10 +701,12 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
goto error;
break;
case ncsi_dev_state_config_done:
+ spin_lock_irqsave(&nc->lock, flags);
if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1)
- xchg(&nc->state, NCSI_CHANNEL_ACTIVE);
+ nc->state = NCSI_CHANNEL_ACTIVE;
else
- xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
ncsi_start_channel_monitor(nc);
ncsi_process_next_channel(ndp);
@@ -707,18 +735,25 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
found = NULL;
NCSI_FOR_EACH_PACKAGE(ndp, np) {
NCSI_FOR_EACH_CHANNEL(np, nc) {
+ spin_lock_irqsave(&nc->lock, flags);
+
if (!list_empty(&nc->link) ||
- nc->state != NCSI_CHANNEL_INACTIVE)
+ nc->state != NCSI_CHANNEL_INACTIVE) {
+ spin_unlock_irqrestore(&nc->lock, flags);
continue;
+ }
if (!found)
found = nc;
ncm = &nc->modes[NCSI_MODE_LINK];
if (ncm->data[2] & 0x1) {
+ spin_unlock_irqrestore(&nc->lock, flags);
found = nc;
goto out;
}
+
+ spin_unlock_irqrestore(&nc->lock, flags);
}
}
@@ -797,7 +832,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
int ret;
nca.ndp = ndp;
- nca.driven = true;
+ nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
switch (nd->state) {
case ncsi_dev_state_probe:
nd->state = ncsi_dev_state_probe_deselect;
@@ -807,7 +842,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
/* Deselect all possible packages */
nca.type = NCSI_PKT_CMD_DP;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
for (index = 0; index < 8; index++) {
nca.package = index;
ret = ncsi_xmit_cmd(&nca);
@@ -823,7 +858,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
/* Select all possible packages */
nca.type = NCSI_PKT_CMD_SP;
nca.bytes[0] = 1;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
for (index = 0; index < 8; index++) {
nca.package = index;
ret = ncsi_xmit_cmd(&nca);
@@ -876,7 +911,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
nca.type = NCSI_PKT_CMD_SP;
nca.bytes[0] = 1;
nca.package = ndp->active_package->id;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
@@ -884,12 +919,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
nd->state = ncsi_dev_state_probe_cis;
break;
case ncsi_dev_state_probe_cis:
- ndp->pending_req_num = 32;
+ ndp->pending_req_num = NCSI_RESERVED_CHANNEL;
/* Clear initial state */
nca.type = NCSI_PKT_CMD_CIS;
nca.package = ndp->active_package->id;
- for (index = 0; index < 0x20; index++) {
+ for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) {
nca.channel = index;
ret = ncsi_xmit_cmd(&nca);
if (ret)
@@ -933,7 +968,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
/* Deselect the active package */
nca.type = NCSI_PKT_CMD_DP;
nca.package = ndp->active_package->id;
- nca.channel = 0x1f;
+ nca.channel = NCSI_RESERVED_CHANNEL;
ret = ncsi_xmit_cmd(&nca);
if (ret)
goto error;
@@ -987,11 +1022,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
goto out;
}
- old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE);
list_del_init(&nc->link);
-
spin_unlock_irqrestore(&ndp->lock, flags);
+ spin_lock_irqsave(&nc->lock, flags);
+ old_state = nc->state;
+ nc->state = NCSI_CHANNEL_INVISIBLE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
ndp->active_channel = nc;
ndp->active_package = nc->package;
@@ -1006,7 +1044,7 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp)
break;
default:
netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n",
- nc->state, nc->package->id, nc->id);
+ old_state, nc->package->id, nc->id);
ncsi_report_link(ndp, false);
return -EINVAL;
}
@@ -1070,7 +1108,7 @@ static int ncsi_inet6addr_event(struct notifier_block *this,
return NOTIFY_OK;
nca.ndp = ndp;
- nca.driven = false;
+ nca.req_flags = 0;
nca.package = np->id;
nca.channel = nc->id;
nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
@@ -1118,7 +1156,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
/* Initialize private NCSI device */
spin_lock_init(&ndp->lock);
INIT_LIST_HEAD(&ndp->packages);
- ndp->request_id = 0;
+ ndp->request_id = NCSI_REQ_START_IDX;
for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
ndp->requests[i].id = i;
ndp->requests[i].ndp = ndp;
@@ -1149,9 +1187,7 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev);
int ncsi_start_dev(struct ncsi_dev *nd)
{
struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
- struct ncsi_package *np;
- struct ncsi_channel *nc;
- int old_state, ret;
+ int ret;
if (nd->state != ncsi_dev_state_registered &&
nd->state != ncsi_dev_state_functional)
@@ -1163,15 +1199,6 @@ int ncsi_start_dev(struct ncsi_dev *nd)
return 0;
}
- /* Reset channel's state and start over */
- NCSI_FOR_EACH_PACKAGE(ndp, np) {
- NCSI_FOR_EACH_CHANNEL(np, nc) {
- old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE);
- WARN_ON_ONCE(!list_empty(&nc->link) ||
- old_state == NCSI_CHANNEL_INVISIBLE);
- }
- }
-
if (ndp->flags & NCSI_DEV_HWA)
ret = ncsi_enable_hwa(ndp);
else
@@ -1181,6 +1208,35 @@ int ncsi_start_dev(struct ncsi_dev *nd)
}
EXPORT_SYMBOL_GPL(ncsi_start_dev);
+void ncsi_stop_dev(struct ncsi_dev *nd)
+{
+ struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ bool chained;
+ int old_state;
+ unsigned long flags;
+
+ /* Stop the channel monitor and reset channel's state */
+ NCSI_FOR_EACH_PACKAGE(ndp, np) {
+ NCSI_FOR_EACH_CHANNEL(np, nc) {
+ ncsi_stop_channel_monitor(nc);
+
+ spin_lock_irqsave(&nc->lock, flags);
+ chained = !list_empty(&nc->link);
+ old_state = nc->state;
+ nc->state = NCSI_CHANNEL_INACTIVE;
+ spin_unlock_irqrestore(&nc->lock, flags);
+
+ WARN_ON_ONCE(chained ||
+ old_state == NCSI_CHANNEL_INVISIBLE);
+ }
+ }
+
+ ncsi_report_link(ndp, true);
+}
+EXPORT_SYMBOL_GPL(ncsi_stop_dev);
+
void ncsi_unregister_dev(struct ncsi_dev *nd)
{
struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
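ncsi_alloc_request() now starts scanning at NCSI_REQ_START_IDX so request ID 0 stays reserved, and request_id records where the next search begins for round-robin fairness. A standalone model of that allocator, in plain C with sizes chosen for illustration:
#include <stdbool.h>
#include <stdio.h>

#define LIMIT     256
#define START_IDX 1		/* mirrors NCSI_REQ_START_IDX; 0 reserved */

static bool used[LIMIT];
static int next_id = START_IDX;

static int alloc_id(void)
{
	int i;

	for (i = next_id; i < LIMIT; i++)	/* cursor to ceiling */
		if (!used[i])
			goto found;
	for (i = START_IDX; i < next_id; i++)	/* wrap around */
		if (!used[i])
			goto found;
	return -1;				/* table exhausted */
found:
	used[i] = true;
	next_id = i + 1;
	return i;
}

int main(void)
{
	printf("%d %d %d\n", alloc_id(), alloc_id(), alloc_id()); /* 1 2 3 */
	return 0;
}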
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index af84389..087db77 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -317,12 +317,12 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr)
ncm->data[3] = ntohl(rsp->other);
ncm->data[4] = ntohl(rsp->oem_status);
- if (nr->driven)
+ if (nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN)
return 0;
/* Reset the channel monitor if it has been enabled */
spin_lock_irqsave(&nc->lock, flags);
- nc->timeout = 0;
+ nc->monitor.state = NCSI_CHANNEL_MONITOR_START;
spin_unlock_irqrestore(&nc->lock, flags);
return 0;
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0c858110..c23c3c8 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
# nf_tables
nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o
-nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o
nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+nf_tables-objs += nft_lookup.o nft_dynset.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d..fa6715d 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -22,6 +22,7 @@
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -61,33 +62,55 @@ EXPORT_SYMBOL(nf_hooks_needed);
#endif
static DEFINE_MUTEX(nf_hook_mutex);
+#define nf_entry_dereference(e) \
+ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
-static struct list_head *nf_find_hook_list(struct net *net,
- const struct nf_hook_ops *reg)
+static struct nf_hook_entry *nf_hook_entry_head(struct net *net,
+ const struct nf_hook_ops *reg)
{
- struct list_head *hook_list = NULL;
+ struct nf_hook_entry *hook_head = NULL;
if (reg->pf != NFPROTO_NETDEV)
- hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+ hook_head = nf_entry_dereference(net->nf.hooks[reg->pf]
+ [reg->hooknum]);
else if (reg->hooknum == NF_NETDEV_INGRESS) {
#ifdef CONFIG_NETFILTER_INGRESS
if (reg->dev && dev_net(reg->dev) == net)
- hook_list = &reg->dev->nf_hooks_ingress;
+ hook_head =
+ nf_entry_dereference(reg->dev->nf_hooks_ingress);
#endif
}
- return hook_list;
+ return hook_head;
}
-struct nf_hook_entry {
- const struct nf_hook_ops *orig_ops;
- struct nf_hook_ops ops;
-};
+/* must hold nf_hook_mutex */
+static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg,
+ struct nf_hook_entry *entry)
+{
+ switch (reg->pf) {
+ case NFPROTO_NETDEV:
+ /* We already checked in nf_register_net_hook() that this is
+ * used from ingress.
+ */
+ rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry);
+ break;
+ default:
+ rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum],
+ entry);
+ break;
+ }
+}
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
- struct list_head *hook_list;
+ struct nf_hook_entry *hooks_entry;
struct nf_hook_entry *entry;
- struct nf_hook_ops *elem;
+
+ if (reg->pf == NFPROTO_NETDEV &&
+ (reg->hooknum != NF_NETDEV_INGRESS ||
+ !reg->dev || dev_net(reg->dev) != net))
+ return -EINVAL;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
@@ -95,19 +118,30 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
entry->orig_ops = reg;
entry->ops = *reg;
+ entry->next = NULL;
+
+ mutex_lock(&nf_hook_mutex);
+ hooks_entry = nf_hook_entry_head(net, reg);
- hook_list = nf_find_hook_list(net, reg);
- if (!hook_list) {
- kfree(entry);
- return -ENOENT;
+ if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) {
+ /* This is the case where we need to insert at the head */
+ entry->next = hooks_entry;
+ hooks_entry = NULL;
}
- mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, hook_list, list) {
- if (reg->priority < elem->priority)
- break;
+ while (hooks_entry &&
+ reg->priority >= hooks_entry->orig_ops->priority &&
+ nf_entry_dereference(hooks_entry->next)) {
+ hooks_entry = nf_entry_dereference(hooks_entry->next);
+ }
+
+ if (hooks_entry) {
+ entry->next = nf_entry_dereference(hooks_entry->next);
+ rcu_assign_pointer(hooks_entry->next, entry);
+ } else {
+ nf_set_hooks_head(net, reg, entry);
}
- list_add_rcu(&entry->ops.list, elem->list.prev);
+
mutex_unlock(&nf_hook_mutex);
#ifdef CONFIG_NETFILTER_INGRESS
if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -122,24 +156,33 @@ EXPORT_SYMBOL(nf_register_net_hook);
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
- struct list_head *hook_list;
- struct nf_hook_entry *entry;
- struct nf_hook_ops *elem;
-
- hook_list = nf_find_hook_list(net, reg);
- if (!hook_list)
- return;
+ struct nf_hook_entry *hooks_entry;
mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, hook_list, list) {
- entry = container_of(elem, struct nf_hook_entry, ops);
- if (entry->orig_ops == reg) {
- list_del_rcu(&entry->ops.list);
- break;
+ hooks_entry = nf_hook_entry_head(net, reg);
+ if (hooks_entry && hooks_entry->orig_ops == reg) {
+ nf_set_hooks_head(net, reg,
+ nf_entry_dereference(hooks_entry->next));
+ goto unlock;
+ }
+ while (hooks_entry && nf_entry_dereference(hooks_entry->next)) {
+ struct nf_hook_entry *next =
+ nf_entry_dereference(hooks_entry->next);
+ struct nf_hook_entry *nnext;
+
+ if (next->orig_ops != reg) {
+ hooks_entry = next;
+ continue;
}
+ nnext = nf_entry_dereference(next->next);
+ rcu_assign_pointer(hooks_entry->next, nnext);
+ hooks_entry = next;
+ break;
}
+
+unlock:
mutex_unlock(&nf_hook_mutex);
- if (&elem->list == hook_list) {
+ if (!hooks_entry) {
WARN(1, "nf_unregister_net_hook: hook not found!\n");
return;
}
@@ -151,10 +194,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
synchronize_net();
- nf_queue_nf_hook_drop(net, &entry->ops);
+ nf_queue_nf_hook_drop(net, hooks_entry);
/* other cpu might still process nfqueue verdict that used reg */
synchronize_net();
- kfree(entry);
+ kfree(hooks_entry);
}
EXPORT_SYMBOL(nf_unregister_net_hook);
@@ -188,19 +231,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
static LIST_HEAD(nf_hook_list);
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
{
struct net *net, *last;
int ret;
- rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(&reg->list, &nf_hook_list);
- rtnl_unlock();
return 0;
rollback:
@@ -210,19 +251,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+ return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ int ret;
+
+ rtnl_lock();
+ ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
}
EXPORT_SYMBOL(nf_register_hook);
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
{
struct net *net;
- rtnl_lock();
list_del(&reg->list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ rtnl_lock();
+ _nf_unregister_hook(reg);
rtnl_unlock();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +302,26 @@ err:
}
EXPORT_SYMBOL(nf_register_hooks);
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = _nf_register_hook(&reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ _nf_unregister_hooks(reg, i);
+ return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
{
while (n-- > 0)
@@ -253,10 +329,17 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
}
EXPORT_SYMBOL(nf_unregister_hooks);
-unsigned int nf_iterate(struct list_head *head,
- struct sk_buff *skb,
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ while (n-- > 0)
+ _nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
+unsigned int nf_iterate(struct sk_buff *skb,
struct nf_hook_state *state,
- struct nf_hook_ops **elemp)
+ struct nf_hook_entry **entryp)
{
unsigned int verdict;
@@ -264,20 +347,23 @@ unsigned int nf_iterate(struct list_head *head,
* The caller must not block between calls to this
* function because of the risk of continuing from a deleted element.
*/
- list_for_each_entry_continue_rcu((*elemp), head, list) {
- if (state->thresh > (*elemp)->priority)
+ while (*entryp) {
+ if (state->thresh > (*entryp)->ops.priority) {
+ *entryp = rcu_dereference((*entryp)->next);
continue;
+ }
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
repeat:
- verdict = (*elemp)->hook((*elemp)->priv, skb, state);
+ verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
> NF_MAX_VERDICT)) {
NFDEBUG("Evil return from %p(%u).\n",
- (*elemp)->hook, state->hook);
+ (*entryp)->ops.hook, state->hook);
+ *entryp = rcu_dereference((*entryp)->next);
continue;
}
#endif
@@ -285,25 +371,23 @@ repeat:
return verdict;
goto repeat;
}
+ *entryp = rcu_dereference((*entryp)->next);
}
return NF_ACCEPT;
}
/* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. */
+ * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
{
- struct nf_hook_ops *elem;
+ struct nf_hook_entry *entry;
unsigned int verdict;
int ret = 0;
- /* We may already have this, but read-locks nest anyway */
- rcu_read_lock();
-
- elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
+ entry = rcu_dereference(state->hook_entries);
next_hook:
- verdict = nf_iterate(state->hook_list, skb, state, &elem);
+ verdict = nf_iterate(skb, state, &entry);
if (verdict == NF_ACCEPT || verdict == NF_STOP) {
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
@@ -312,8 +396,10 @@ next_hook:
if (ret == 0)
ret = -EPERM;
} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- int err = nf_queue(skb, elem, state,
- verdict >> NF_VERDICT_QBITS);
+ int err;
+
+ RCU_INIT_POINTER(state->hook_entries, entry);
+ err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
if (err < 0) {
if (err == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
@@ -321,7 +407,6 @@ next_hook:
kfree_skb(skb);
}
}
- rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(nf_hook_slow);
@@ -441,7 +526,7 @@ static int __net_init netfilter_net_init(struct net *net)
for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+ RCU_INIT_POINTER(net->nf.hooks[i][h], NULL);
}
#ifdef CONFIG_PROC_FS
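The core change in this file replaces the nf_hook_ops list_head chains with an RCU-protected singly linked list of nf_hook_entry sorted by ascending priority: insert at the head when the first entry already outranks the new one, otherwise walk to the last entry whose priority is not greater and splice in after it. A minimal non-RCU model of that insertion, with rcu_assign_pointer()/nf_entry_dereference() elided for brevity:
struct entry {
	struct entry *next;
	int priority;
};

static void insert_sorted(struct entry **head, struct entry *e)
{
	struct entry *pos = *head;

	if (!pos || pos->priority > e->priority) {
		e->next = pos;			/* new head of chain */
		*head = e;
		return;
	}
	while (pos->next && pos->next->priority <= e->priority)
		pos = pos->next;		/* keep equal-priority order */
	e->next = pos->next;
	pos->next = e;
}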
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac1db40..ba6a1d4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -379,7 +379,6 @@ static void
destroy_conntrack(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- struct net *net = nf_ct_net(ct);
struct nf_conntrack_l4proto *l4proto;
pr_debug("destroy_conntrack(%p)\n", ct);
@@ -406,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- NF_CT_STAT_INC(net, delete);
local_bh_enable();
if (ct->master)
@@ -438,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
nf_ct_add_to_dying_list(ct);
- NF_CT_STAT_INC(net, delete_list);
local_bh_enable();
}
@@ -529,11 +526,8 @@ begin:
if (nf_ct_is_dying(ct))
continue;
- if (nf_ct_key_equal(h, tuple, zone, net)) {
- NF_CT_STAT_INC_ATOMIC(net, found);
+ if (nf_ct_key_equal(h, tuple, zone, net))
return h;
- }
- NF_CT_STAT_INC_ATOMIC(net, searched);
}
/*
* if the nulls value we got at the end of this lookup is
@@ -798,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert);
local_bh_enable();
help = nfct_help(ct);
@@ -857,7 +850,6 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
rcu_read_unlock();
return 1;
}
- NF_CT_STAT_INC_ATOMIC(net, searched);
}
if (get_nulls_value(n) != hash) {
@@ -1116,9 +1108,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
- if (tmpl && nfct_synproxy(tmpl)) {
- nfct_seqadj_ext_add(ct);
- nfct_synproxy_ext_add(ct);
+ if (!nf_ct_add_synproxy(ct, tmpl)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
@@ -1177,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
spin_unlock(&nf_conntrack_expect_lock);
}
- if (!exp) {
+ if (!exp)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- NF_CT_STAT_INC(net, new);
- }
/* Now it is inserted into the unconfirmed list, bump refcount */
nf_conntrack_get(&ct->ct_general);
@@ -1285,7 +1275,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
skb->nfct = NULL;
}
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
l3proto = __nf_ct_l3proto_find(pf);
ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
&dataoff, &protonum);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index b6934b5..e3ed200 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -301,8 +301,6 @@ static int find_pattern(const char *data, size_t dlen,
size_t i = plen;
pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
- if (dlen == 0)
- return 0;
if (dlen <= plen) {
/* Short packet: try for partial? */
@@ -311,19 +309,8 @@ static int find_pattern(const char *data, size_t dlen,
else return 0;
}
- if (strncasecmp(data, pattern, plen) != 0) {
-#if 0
- size_t i;
-
- pr_debug("ftp: string mismatch\n");
- for (i = 0; i < plen; i++) {
- pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
- i, data[i], data[i],
- pattern[i], pattern[i]);
- }
-#endif
+ if (strncasecmp(data, pattern, plen) != 0)
return 0;
- }
pr_debug("Pattern matches!\n");
/* Now we've found the constant string, try to skip
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5c0db5c..f65d9363 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -736,7 +736,7 @@ static int callforward_do_filter(struct net *net,
const struct nf_afinfo *afinfo;
int ret = 0;
- /* rcu_read_lock()ed by nf_hook_slow() */
+ /* rcu_read_lock()ed by nf_hook_thresh */
afinfo = nf_get_afinfo(family);
if (!afinfo)
return 0;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index b989b81..336e215 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -189,7 +189,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
struct net *net = nf_ct_net(ct);
- int ret = 0;
/* We already got a helper explicitly attached. The function
* nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -223,15 +222,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
if (helper == NULL) {
if (help)
RCU_INIT_POINTER(help->helper, NULL);
- goto out;
+ return 0;
}
if (help == NULL) {
help = nf_ct_helper_ext_add(ct, helper, flags);
- if (help == NULL) {
- ret = -ENOMEM;
- goto out;
- }
+ if (help == NULL)
+ return -ENOMEM;
} else {
/* We only allow helper re-assignment of the same sort since
* we cannot reallocate the helper extension area.
@@ -240,13 +237,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
if (tmp && tmp->help != helper->help) {
RCU_INIT_POINTER(help->helper, NULL);
- goto out;
+ return 0;
}
}
rcu_assign_pointer(help->helper, helper);
-out:
- return ret;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
@@ -349,7 +346,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
/* Called from the helper function, this call never fails */
help = nfct_help(ct);
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c052b71..2754045 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1984,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(cpu);
- if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
- nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
- nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+ if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
- nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
- nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a96451a..9a715f8 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct net *net, struct nf_conntrack_tuple *tuple)
{
- const struct gre_hdr_pptp *pgrehdr;
- struct gre_hdr_pptp _pgrehdr;
+ const struct pptp_gre_header *pgrehdr;
+ struct pptp_gre_header _pgrehdr;
__be16 srckey;
- const struct gre_hdr *grehdr;
- struct gre_hdr _grehdr;
+ const struct gre_base_hdr *grehdr;
+ struct gre_base_hdr _grehdr;
/* first only delinearize old RFC1701 GRE header */
grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
- if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+ if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) {
/* try to behave like "nf_conntrack_proto_generic" */
tuple->src.u.all = 0;
tuple->dst.u.all = 0;
@@ -212,8 +212,8 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
if (!pgrehdr)
return true;
- if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
- pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+ if (grehdr->protocol != GRE_PROTO_PPP) {
+ pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol));
return false;
}
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index dff0f0c..ef7063e 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -169,7 +169,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
s32 seqoff, ackoff;
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
struct nf_ct_seqadj *this_way, *other_way;
- int res;
+ int res = 1;
this_way = &seqadj->seq[dir];
other_way = &seqadj->seq[!dir];
@@ -184,27 +184,31 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
else
seqoff = this_way->offset_before;
+ newseq = htonl(ntohl(tcph->seq) + seqoff);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
+ pr_debug("Adjusting sequence number from %u->%u\n",
+ ntohl(tcph->seq), ntohl(newseq));
+ tcph->seq = newseq;
+
+ if (!tcph->ack)
+ goto out;
+
if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
other_way->correction_pos))
ackoff = other_way->offset_after;
else
ackoff = other_way->offset_before;
- newseq = htonl(ntohl(tcph->seq) + seqoff);
newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
- inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
false);
-
- pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+ pr_debug("Adjusting ack number: seq %u->%u, ack %u->%u\n",
ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
ntohl(newack));
-
- tcph->seq = newseq;
tcph->ack_seq = newack;
res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+out:
spin_unlock_bh(&ct->lock);
return res;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 7d77217..621b81c 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -83,9 +83,10 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
static int iswordc(const char c)
{
if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
- (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+ (c >= '(' && c <= '+') || c == ':' || c == '<' || c == '>' ||
c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
- c == '{' || c == '}' || c == '~')
+ c == '{' || c == '}' || c == '~' || (c >= '-' && c <= '/') ||
+ c == '\'')
return 1;
return 0;
}
@@ -329,13 +330,12 @@ static const char *sip_follow_continuation(const char *dptr, const char *limit)
static const char *sip_skip_whitespace(const char *dptr, const char *limit)
{
for (; dptr < limit; dptr++) {
- if (*dptr == ' ')
+ if (*dptr == ' ' || *dptr == '\t')
continue;
if (*dptr != '\r' && *dptr != '\n')
break;
dptr = sip_follow_continuation(dptr, limit);
- if (dptr == NULL)
- return NULL;
+ break;
}
return dptr;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 3d9a316..5f446cd 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -212,6 +212,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
return 0;
+ if (nf_ct_should_gc(ct)) {
+ nf_ct_kill(ct);
+ goto release;
+ }
+
/* we only want to print DIR_ORIGINAL */
if (NF_CT_DIRECTION(hash))
goto release;
@@ -352,13 +357,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- st->searched,
+ 0,
st->found,
- st->new,
+ 0,
st->invalid,
st->ignore,
- st->delete,
- st->delete_list,
+ 0,
+ 0,
st->insert,
st->insert_failed,
st->drop,
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 0655225..e0adb59 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -13,13 +13,13 @@
/* core.c */
-unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
- struct nf_hook_state *state, struct nf_hook_ops **elemp);
+unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state,
+ struct nf_hook_entry **entryp);
/* nf_queue.c */
-int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
- struct nf_hook_state *state, unsigned int queuenum);
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+ unsigned int queuenum);
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry);
int __init netfilter_queue_init(void);
/* nf_log.c */
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index a5aa596..119fe1c 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -77,7 +77,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
nf_log_buf_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & XT_LOG_TCPSEQ) {
+ if (logflags & NF_LOG_TCPSEQ) {
nf_log_buf_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
}
@@ -107,7 +107,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
/* Max length: 11 "URGP=65535 " */
nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
- if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+ if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
u_int8_t _opt[60 - sizeof(struct tcphdr)];
const u_int8_t *op;
unsigned int i;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 81ae41f..bbb8f3d 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
if (nfct_help(ct))
- nfct_seqadj_ext_add(ct);
+ if (!nfct_seqadj_ext_add(ct))
+ return NF_DROP;
}
if (maniptype == NF_NAT_MANIP_SRC) {
@@ -801,7 +802,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
if (err < 0)
return err;
- return nf_nat_setup_info(ct, &range, manip);
+ return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b19ad20..96964a0 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
-void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
+void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
{
const struct nf_queue_handler *qh;
rcu_read_lock();
qh = rcu_dereference(net->nf.queue_handler);
if (qh)
- qh->nf_hook_drop(net, ops);
+ qh->nf_hook_drop(net, entry);
rcu_read_unlock();
}
@@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
* through nf_reinject().
*/
int nf_queue(struct sk_buff *skb,
- struct nf_hook_ops *elem,
struct nf_hook_state *state,
unsigned int queuenum)
{
@@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb,
*entry = (struct nf_queue_entry) {
.skb = skb,
- .elem = elem,
.state = *state,
.size = sizeof(*entry) + afinfo->route_key_size,
};
@@ -165,11 +163,15 @@ err:
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
+ struct nf_hook_entry *hook_entry;
struct sk_buff *skb = entry->skb;
- struct nf_hook_ops *elem = entry->elem;
const struct nf_afinfo *afinfo;
+ struct nf_hook_ops *elem;
int err;
+ hook_entry = rcu_dereference(entry->state.hook_entries);
+ elem = &hook_entry->ops;
+
nf_queue_entry_release_refs(entry);
/* Continue traversal iff userspace said ok... */
@@ -186,8 +188,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
if (verdict == NF_ACCEPT) {
next_hook:
- verdict = nf_iterate(entry->state.hook_list,
- skb, &entry->state, &elem);
+ verdict = nf_iterate(skb, &entry->state, &hook_entry);
}
switch (verdict & NF_VERDICT_MASK) {
@@ -198,7 +199,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
local_bh_enable();
break;
case NF_QUEUE:
- err = nf_queue(skb, elem, &entry->state,
+ RCU_INIT_POINTER(entry->state.hook_entries, hook_entry);
+ err = nf_queue(skb, &entry->state,
verdict >> NF_VERDICT_QBITS);
if (err < 0) {
if (err == -ESRCH &&
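After this refactor a queued packet no longer carries a raw nf_hook_ops pointer: the traversal position is the nf_hook_entry stored in entry->state.hook_entries, so nf_reinject() can resume iteration exactly where the packet was queued, and a further NF_QUEUE verdict simply stores the current entry back into the state. A minimal sketch of that resume-from-entry idea over a singly linked hook list; the types and verdicts below are simplified stand-ins for the kernel structures:

#include <stdio.h>

enum verdict { V_ACCEPT, V_QUEUE };

struct hook_entry {
	struct hook_entry *next;
	enum verdict (*hook)(const char *pkt);
	const char *name;
};

/* Run hooks starting at *pos; on V_QUEUE, leave *pos at the queuing
 * hook so a later reinject can resume from (*pos)->next. */
static enum verdict iterate(const char *pkt, struct hook_entry **pos)
{
	for (; *pos; *pos = (*pos)->next) {
		enum verdict v = (*pos)->hook(pkt);

		printf("%s: %s\n", (*pos)->name,
		       v == V_QUEUE ? "queue" : "accept");
		if (v == V_QUEUE)
			return V_QUEUE;
	}
	return V_ACCEPT;
}

static enum verdict accept_hook(const char *p) { (void)p; return V_ACCEPT; }
static enum verdict queue_hook(const char *p)  { (void)p; return V_QUEUE; }

int main(void)
{
	struct hook_entry h3 = { NULL, accept_hook, "h3" };
	struct hook_entry h2 = { &h3, queue_hook, "h2" };
	struct hook_entry h1 = { &h2, accept_hook, "h1" };
	struct hook_entry *pos = &h1;

	if (iterate("pkt", &pos) == V_QUEUE) {
		pos = pos->next;   /* userspace said ACCEPT: resume */
		iterate("pkt", &pos);
	}
	return 0;
}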
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bd9715e..b70d3ea 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4410,6 +4410,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
}
/**
+ * nft_parse_u32_check - fetch u32 attribute and check for maximum value
+ *
+ * @attr: netlink attribute to fetch value from
+ * @max: maximum value to be stored in dest
+ * @dest: pointer to the variable
+ *
+ * Parse, check and store a given u32 netlink attribute into the
+ * destination variable. This function returns -ERANGE if the value
+ * exceeds the maximum value; otherwise 0 is returned and the attribute
+ * value is stored in the destination variable.
+ */
+unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
+{
+ int val;
+
+ val = ntohl(nla_get_be32(attr));
+ if (val > max)
+ return -ERANGE;
+
+ *dest = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_u32_check);
+
+/**
* nft_parse_register - parse a register value from a netlink attribute
*
* @attr: netlink attribute
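With this helper, expression init paths stop open-coding ntohl(nla_get_be32(...)) and get the range check in one place; the callers converted later in this series (bitwise, byteorder, exthdr) all follow the same pattern. Note the declaration is unsigned int while the body returns -ERANGE, so callers must assign the result to a signed int before testing it, as those callers do. A user-space replica of the contract, with the comparison tightened to unsigned:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Replica of the helper's contract: take a big-endian 32-bit
 * attribute payload, bounds-check it, store it in host order. */
static int parse_u32_check(uint32_t be_payload, int max, uint32_t *dest)
{
	uint32_t val = ntohl(be_payload);

	if (val > (uint32_t)max)
		return -ERANGE;
	*dest = val;
	return 0;
}

int main(void)
{
	uint32_t len;

	/* 300 > U8_MAX: a converted caller now rejects this cleanly
	 * instead of silently truncating it into a u8 field. */
	if (parse_u32_check(htonl(300), 255, &len) < 0)
		puts("rejected: -ERANGE");
	if (parse_u32_check(htonl(16), 255, &len) == 0)
		printf("accepted: %u\n", len);
	return 0;
}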
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index fb8b589..0dd5c69 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = {
.u = {
.log = {
.level = LOGLEVEL_WARNING,
- .logflags = NF_LOG_MASK,
+ .logflags = NF_LOG_DEFAULT_MASK,
},
},
};
@@ -93,12 +93,15 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb);
- else
+ else {
+ if (!pkt->tprot_set)
+ return false;
ptr = skb_network_header(skb) + pkt->xt.thoff;
+ }
ptr += priv->offset;
- if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+ if (unlikely(ptr + priv->len > skb_tail_pointer(skb)))
return false;
*dest = 0;
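Two fixes land in the fast path: transport-header reads bail out when no transport protocol was recognized (pkt->tprot_set), and the tail-bound test becomes > rather than >=. Since ptr + priv->len points one byte past the last byte read, a read ending exactly at skb_tail_pointer() is legal; the old >= wrongly rejected it. A tiny worked example of the boundary arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned char buf[20];
	unsigned char *tail = buf + sizeof(buf);   /* one past last byte */
	unsigned char *ptr = buf + 16;
	unsigned int len = 4;                      /* read bytes 16..19 */

	/* ptr + len == tail: the read fits exactly. */
	printf("old check (>=) rejects: %d\n", ptr + len >= tail);
	printf("new check (>)  rejects: %d\n", ptr + len > tail);
	return 0;
}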
@@ -260,8 +263,13 @@ int __init nf_tables_core_module_init(void)
if (err < 0)
goto err7;
- return 0;
+ err = nft_range_module_init();
+ if (err < 0)
+ goto err8;
+ return 0;
+err8:
+ nft_dynset_module_exit();
err7:
nft_payload_module_exit();
err6:
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 6b5f762..f713cc2 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -82,7 +82,10 @@ static int __init nf_tables_inet_init(void)
{
int ret;
- nft_register_chain_type(&filter_inet);
+ ret = nft_register_chain_type(&filter_inet);
+ if (ret < 0)
+ return ret;
+
ret = register_pernet_subsys(&nf_tables_inet_net_ops);
if (ret < 0)
nft_unregister_chain_type(&filter_inet);
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 75d696f..9e2ae42 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -15,78 +15,6 @@
#include <net/netfilter/nf_tables_ipv4.h>
#include <net/netfilter/nf_tables_ipv6.h>
-static inline void
-nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct iphdr *iph, _iph;
- u32 len, thoff;
-
- nft_set_pktinfo(pkt, skb, state);
-
- iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
- &_iph);
- if (!iph)
- return;
-
- if (iph->ihl < 5 || iph->version != 4)
- return;
-
- len = ntohs(iph->tot_len);
- thoff = iph->ihl * 4;
- if (skb->len < len)
- return;
- else if (len < thoff)
- return;
-
- pkt->tprot = iph->protocol;
- pkt->xt.thoff = thoff;
- pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
-}
-
-static inline void
-__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#if IS_ENABLED(CONFIG_IPV6)
- struct ipv6hdr *ip6h, _ip6h;
- unsigned int thoff = 0;
- unsigned short frag_off;
- int protohdr;
- u32 pkt_len;
-
- ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
- &_ip6h);
- if (!ip6h)
- return;
-
- if (ip6h->version != 6)
- return;
-
- pkt_len = ntohs(ip6h->payload_len);
- if (pkt_len + sizeof(*ip6h) > skb->len)
- return;
-
- protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
- if (protohdr < 0)
- return;
-
- pkt->tprot = protohdr;
- pkt->xt.thoff = thoff;
- pkt->xt.fragoff = frag_off;
-#endif
-}
-
-static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- nft_set_pktinfo(pkt, skb, state);
- __nft_netdev_set_pktinfo_ipv6(pkt, skb, state);
-}
-
static unsigned int
nft_do_chain_netdev(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -95,13 +23,13 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
switch (skb->protocol) {
case htons(ETH_P_IP):
- nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
+ nft_set_pktinfo_ipv4_validate(&pkt, skb, state);
break;
case htons(ETH_P_IPV6):
- nft_netdev_set_pktinfo_ipv6(&pkt, skb, state);
+ nft_set_pktinfo_ipv6_validate(&pkt, skb, state);
break;
default:
- nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_unspec(&pkt, skb, state);
break;
}
@@ -221,14 +149,25 @@ static int __init nf_tables_netdev_init(void)
{
int ret;
- nft_register_chain_type(&nft_filter_chain_netdev);
- ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
- if (ret < 0) {
- nft_unregister_chain_type(&nft_filter_chain_netdev);
+ ret = nft_register_chain_type(&nft_filter_chain_netdev);
+ if (ret)
return ret;
- }
- register_netdevice_notifier(&nf_tables_netdev_notifier);
+
+ ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+ if (ret)
+ goto err1;
+
+ ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
+ if (ret)
+ goto err2;
+
return 0;
+
+err2:
+ unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+err1:
+ nft_unregister_chain_type(&nft_filter_chain_netdev);
+ return ret;
}
static void __exit nf_tables_netdev_exit(void)
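The init path is reshaped into the conventional goto-unwind ladder: every registration that can fail gets a label that undoes, in reverse order, everything registered before it, so a failing register_netdevice_notifier() no longer leaks the pernet subsystem or the chain type (and the chain-type registration itself is finally checked). A compact stand-alone illustration of the pattern; the three step functions are placeholders:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("register %s\n", name);
	return fail ? -1 : 0;
}

static void unstep(const char *name) { printf("unregister %s\n", name); }

static int module_init_sketch(void)
{
	int ret;

	ret = step("chain type", 0);
	if (ret)
		return ret;
	ret = step("pernet subsys", 0);
	if (ret)
		goto err1;
	ret = step("netdev notifier", 1);   /* simulate a failure */
	if (ret)
		goto err2;
	return 0;

err2:
	unstep("pernet subsys");
err1:
	unstep("chain type");
	return ret;
}

int main(void)
{
	return module_init_sketch() ? 1 : 0;
}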
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 39eb1cc..ab695f8 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -113,20 +113,22 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
const struct sk_buff *skb = pkt->skb;
- unsigned int len = min_t(unsigned int,
- pkt->xt.thoff - skb_network_offset(skb),
- NFT_TRACETYPE_NETWORK_HSIZE);
int off = skb_network_offset(skb);
+ unsigned int len, nh_end;
+ nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
+ len = min_t(unsigned int, nh_end - skb_network_offset(skb),
+ NFT_TRACETYPE_NETWORK_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
return -1;
- len = min_t(unsigned int, skb->len - pkt->xt.thoff,
- NFT_TRACETYPE_TRANSPORT_HSIZE);
-
- if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
- pkt->xt.thoff, len))
- return -1;
+ if (pkt->tprot_set) {
+ len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+ NFT_TRACETYPE_TRANSPORT_HSIZE);
+ if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
+ pkt->xt.thoff, len))
+ return -1;
+ }
if (!skb_mac_header_was_set(skb))
return 0;
@@ -237,7 +239,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
break;
case NFT_TRACETYPE_POLICY:
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
- info->basechain->policy))
+ htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
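The trace fix is a plain endianness bug: NFTA_TRACE_POLICY is a big-endian 32-bit attribute, so the host-order policy value must go through htonl() before nla_put_be32(). On little-endian hosts the old code put byte-swapped policy values on the wire; big-endian hosts happened to mask the bug. A one-file illustration:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned int policy = 1;   /* host-order chain policy value */

	/* A be32 attribute must carry htonl(policy). On little-endian,
	 * omitting the swap sends 0x01000000, which userspace decodes
	 * as ntohl(0x01000000) = 16777216 instead of 1. */
	printf("wire value with htonl:   0x%08x\n", (unsigned)htonl(policy));
	printf("wire value without swap: 0x%08x\n", policy);
	return 0;
}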
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index e924e95..3b79f34 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -43,7 +43,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
if (help == NULL)
return NF_DROP;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(help->helper);
if (helper == NULL)
return NF_DROP;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6577db5..eb086a1 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -442,7 +442,9 @@ __build_packet_message(struct nfnl_log_net *log,
if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
htonl(indev->ifindex)) ||
/* this is the bridge group "brX" */
- /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ /* rcu_read_lock()ed by nf_hook_thresh or
+ * nf_log_packet.
+ */
nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
goto nla_put_failure;
@@ -477,7 +479,9 @@ __build_packet_message(struct nfnl_log_net *log,
if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
htonl(outdev->ifindex)) ||
/* this is the bridge group "brX" */
- /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ /* rcu_read_lock()ed by nf_hook_thresh or
+ * nf_log_packet.
+ */
nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
goto nla_put_failure;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f49f450..af832c5 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -740,7 +740,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
- /* rcu_read_lock()ed by nf_hook_slow() */
+ /* rcu_read_lock()ed by nf_hook_thresh */
queue = instance_lookup(q, queuenum);
if (!queue)
return -ESRCH;
@@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = {
.notifier_call = nfqnl_rcv_dev_event,
};
-static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr)
+static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
{
- return entry->elem == (struct nf_hook_ops *)ops_ptr;
+ return rcu_access_pointer(entry->state.hook_entries) ==
+ (struct nf_hook_entry *)entry_ptr;
}
-static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook)
+static void nfqnl_nf_hook_drop(struct net *net,
+ const struct nf_hook_entry *hook)
{
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int i;
@@ -1522,9 +1524,16 @@ static int __init nfnetlink_queue_init(void)
goto cleanup_netlink_notifier;
}
- register_netdevice_notifier(&nfqnl_dev_notifier);
+ status = register_netdevice_notifier(&nfqnl_dev_notifier);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register netdevice notifier\n");
+ goto cleanup_netlink_subsys;
+ }
+
return status;
+cleanup_netlink_subsys:
+ nfnetlink_subsys_unregister(&nfqnl_subsys);
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
unregister_pernet_subsys(&nfnl_queue_net_ops);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index d71cc18..31c15ed 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
{
struct nft_bitwise *priv = nft_expr_priv(expr);
struct nft_data_desc d1, d2;
+ u32 len;
int err;
if (tb[NFTA_BITWISE_SREG] == NULL ||
@@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
tb[NFTA_BITWISE_XOR] == NULL)
return -EINVAL;
- priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
+
+ priv->len = len;
+
priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
err = nft_validate_register_load(priv->sreg, priv->len);
if (err < 0)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index b78c28b..ee63d98 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_byteorder *priv = nft_expr_priv(expr);
+ u32 size, len;
int err;
if (tb[NFTA_BYTEORDER_SREG] == NULL ||
@@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+ err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size);
+ if (err < 0)
+ return err;
+
+ priv->size = size;
+
switch (priv->size) {
case 2:
case 4:
@@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
}
priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
- priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
+
+ priv->len = len;
+
err = nft_validate_register_load(priv->sreg, priv->len);
if (err < 0)
return err;
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index e25b35d..2e53739 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -84,6 +84,9 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (err < 0)
return err;
+ if (desc.len > U8_MAX)
+ return -ERANGE;
+
priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
priv->len = desc.len;
return 0;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 51e180f..d7b0d171 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -128,15 +128,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
memcpy(dest, &count, sizeof(count));
return;
}
+ case NFT_CT_L3PROTOCOL:
+ *dest = nf_ct_l3num(ct);
+ return;
+ case NFT_CT_PROTOCOL:
+ *dest = nf_ct_protonum(ct);
+ return;
default:
break;
}
tuple = &ct->tuplehash[priv->dir].tuple;
switch (priv->key) {
- case NFT_CT_L3PROTOCOL:
- *dest = nf_ct_l3num(ct);
- return;
case NFT_CT_SRC:
memcpy(dest, tuple->src.u3.all,
nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
@@ -145,9 +148,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
memcpy(dest, tuple->dst.u3.all,
nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
return;
- case NFT_CT_PROTOCOL:
- *dest = nf_ct_protonum(ct);
- return;
case NFT_CT_PROTO_SRC:
*dest = (__force __u16)tuple->src.u.all;
return;
@@ -283,8 +283,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
case NFT_CT_L3PROTOCOL:
case NFT_CT_PROTOCOL:
- if (tb[NFTA_CT_DIRECTION] == NULL)
- return -EINVAL;
+ /* For compatibility, do not report an error if the
+ * NFTA_CT_DIRECTION attribute is specified.
+ */
len = sizeof(u8);
break;
case NFT_CT_SRC:
@@ -363,6 +364,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
len = FIELD_SIZEOF(struct nf_conn, mark);
break;
#endif
@@ -432,8 +435,6 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
switch (priv->key) {
- case NFT_CT_L3PROTOCOL:
- case NFT_CT_PROTOCOL:
case NFT_CT_SRC:
case NFT_CT_DST:
case NFT_CT_PROTO_SRC:
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 0af2669..e3b83c3 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -22,6 +22,7 @@ struct nft_dynset {
enum nft_dynset_ops op:8;
enum nft_registers sreg_key:8;
enum nft_registers sreg_data:8;
+ bool invert;
u64 timeout;
struct nft_expr *expr;
struct nft_set_binding binding;
@@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr,
if (sexpr != NULL)
sexpr->ops->eval(sexpr, regs, pkt);
+
+ if (priv->invert)
+ regs->verdict.code = NFT_BREAK;
return;
}
out:
- regs->verdict.code = NFT_BREAK;
+ if (!priv->invert)
+ regs->verdict.code = NFT_BREAK;
}
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
@@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
+ [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
};
static int nft_dynset_init(const struct nft_ctx *ctx,
@@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
tb[NFTA_DYNSET_SREG_KEY] == NULL)
return -EINVAL;
+ if (tb[NFTA_DYNSET_FLAGS]) {
+ u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
+
+ if (flags & ~NFT_DYNSET_F_INV)
+ return -EINVAL;
+ if (flags & NFT_DYNSET_F_INV)
+ priv->invert = true;
+ }
+
set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
genmask);
if (IS_ERR(set)) {
@@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
+ u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
goto nla_put_failure;
@@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 82c264e..a84cf3d 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,7 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_exthdr *priv = nft_expr_priv(expr);
- u32 offset, len;
+ u32 offset, len, err;
if (tb[NFTA_EXTHDR_DREG] == NULL ||
tb[NFTA_EXTHDR_TYPE] == NULL ||
@@ -67,11 +67,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
tb[NFTA_EXTHDR_LEN] == NULL)
return -EINVAL;
- offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
- len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
+ if (err < 0)
+ return err;
- if (offset > U8_MAX || len > U8_MAX)
- return -ERANGE;
+ err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 764251d..09473b4 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -23,6 +23,7 @@ struct nft_hash {
u8 len;
u32 modulus;
u32 seed;
+ u32 offset;
};
static void nft_hash_eval(const struct nft_expr *expr,
@@ -31,10 +32,10 @@ static void nft_hash_eval(const struct nft_expr *expr,
{
struct nft_hash *priv = nft_expr_priv(expr);
const void *data = &regs->data[priv->sreg];
+ u32 h;
- regs->data[priv->dreg] =
- reciprocal_scale(jhash(data, priv->len, priv->seed),
- priv->modulus);
+ h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+ regs->data[priv->dreg] = h + priv->offset;
}
static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
@@ -59,6 +60,9 @@ static int nft_hash_init(const struct nft_ctx *ctx,
!tb[NFTA_HASH_MODULUS])
return -EINVAL;
+ if (tb[NFTA_HASH_OFFSET])
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
+
priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
@@ -72,6 +76,9 @@ static int nft_hash_init(const struct nft_ctx *ctx,
if (priv->modulus <= 1)
return -ERANGE;
+ if (priv->offset + priv->modulus - 1 < priv->offset)
+ return -EOVERFLOW;
+
priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
return nft_validate_register_load(priv->sreg, len) &&
@@ -94,7 +101,9 @@ static int nft_hash_dump(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
goto nla_put_failure;
-
+ if (priv->offset != 0)
+ if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
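nft_hash gains an optional offset so results land in [offset, offset + modulus - 1], letting rules map a hash onto a range that does not start at zero (queue numbers, ports). The offset + modulus - 1 < offset test is the standard wrap-around guard for u32 addition. A user-space sketch of the scaling and the guard; reciprocal_scale() is replicated from its kernel definition and the fake hash stands in for jhash():

#include <stdint.h>
#include <stdio.h>

/* Replica of the kernel's reciprocal_scale(): map a full-range u32
 * value onto [0, ep_ro) without a division. */
static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	uint32_t modulus = 16, offset = 100;

	/* Reject ranges whose top, offset + modulus - 1, wraps past
	 * UINT32_MAX -- the check added in nft_hash_init(). */
	if (offset + modulus - 1 < offset) {
		puts("-EOVERFLOW");
		return 1;
	}

	for (uint32_t h = 0; h < 5; h++) {
		uint32_t fakehash = h * 0x9e3779b9u;  /* stand-in for jhash */
		printf("%u -> %u\n", fakehash,
		       reciprocal_scale(fakehash, modulus) + offset);
	}
	return 0;
}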
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index db3b746..d17018f 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -53,6 +53,10 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
tb[NFTA_IMMEDIATE_DATA]);
if (err < 0)
return err;
+
+ if (desc.len > U8_MAX)
+ return -ERANGE;
+
priv->dlen = desc.len;
priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 24a73bb..1b01404 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx,
if (tb[NFTA_LOG_LEVEL] != NULL &&
tb[NFTA_LOG_GROUP] != NULL)
return -EINVAL;
- if (tb[NFTA_LOG_GROUP] != NULL)
+ if (tb[NFTA_LOG_GROUP] != NULL) {
li->type = NF_LOG_TYPE_ULOG;
+ if (tb[NFTA_LOG_FLAGS] != NULL)
+ return -EINVAL;
+ }
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
@@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
if (tb[NFTA_LOG_FLAGS] != NULL) {
li->u.log.logflags =
ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
+ if (li->u.log.logflags & ~NF_LOG_MASK) {
+ err = -EINVAL;
+ goto err1;
+ }
}
break;
case NF_LOG_TYPE_ULOG:
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index e164325..8166b69 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -43,7 +43,7 @@ static void nft_lookup_eval(const struct nft_expr *expr,
return;
}
- if (found && set->flags & NFT_SET_MAP)
+ if (set->flags & NFT_SET_MAP)
nft_data_copy(&regs->data[priv->dreg],
nft_set_ext_data(ext), set->dlen);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 8a6bc76..6c1e024 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -52,6 +52,8 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*dest = pkt->pf;
break;
case NFT_META_L4PROTO:
+ if (!pkt->tprot_set)
+ goto err;
*dest = pkt->tprot;
break;
case NFT_META_PRIORITY:
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 294745e..55bc5ab 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -21,8 +21,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
struct nft_ng_inc {
enum nft_registers dreg:8;
- u32 until;
+ u32 modulus;
atomic_t counter;
+ u32 offset;
};
static void nft_ng_inc_eval(const struct nft_expr *expr,
@@ -34,16 +35,17 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
do {
oval = atomic_read(&priv->counter);
- nval = (oval + 1 < priv->until) ? oval + 1 : 0;
+ nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
- memcpy(&regs->data[priv->dreg], &priv->counter, sizeof(u32));
+ regs->data[priv->dreg] = nval + priv->offset;
}
static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
[NFTA_NG_DREG] = { .type = NLA_U32 },
- [NFTA_NG_UNTIL] = { .type = NLA_U32 },
+ [NFTA_NG_MODULUS] = { .type = NLA_U32 },
[NFTA_NG_TYPE] = { .type = NLA_U32 },
+ [NFTA_NG_OFFSET] = { .type = NLA_U32 },
};
static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -52,10 +54,16 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
{
struct nft_ng_inc *priv = nft_expr_priv(expr);
- priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
- if (priv->until == 0)
+ if (tb[NFTA_NG_OFFSET])
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
+ priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+ if (priv->modulus == 0)
return -ERANGE;
+ if (priv->offset + priv->modulus - 1 < priv->offset)
+ return -EOVERFLOW;
+
priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
atomic_set(&priv->counter, 0);
@@ -64,14 +72,16 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
}
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
- u32 until, enum nft_ng_types type)
+ u32 modulus, enum nft_ng_types type, u32 offset)
{
if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_NG_UNTIL, htonl(until)))
+ if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus)))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset)))
+ goto nla_put_failure;
return 0;
@@ -83,12 +93,14 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ng_inc *priv = nft_expr_priv(expr);
- return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL);
+ return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL,
+ priv->offset);
}
struct nft_ng_random {
enum nft_registers dreg:8;
- u32 until;
+ u32 modulus;
+ u32 offset;
};
static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -97,9 +109,10 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
{
struct nft_ng_random *priv = nft_expr_priv(expr);
struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+ u32 val;
- regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
- priv->until);
+ val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
+ regs->data[priv->dreg] = val + priv->offset;
}
static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -108,10 +121,16 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
{
struct nft_ng_random *priv = nft_expr_priv(expr);
- priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
- if (priv->until == 0)
+ if (tb[NFTA_NG_OFFSET])
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
+ priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+ if (priv->modulus == 0)
return -ERANGE;
+ if (priv->offset + priv->modulus - 1 < priv->offset)
+ return -EOVERFLOW;
+
prandom_init_once(&nft_numgen_prandom_state);
priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
@@ -124,7 +143,8 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
- return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM);
+ return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM,
+ priv->offset);
}
static struct nft_expr_type nft_ng_type;
@@ -149,8 +169,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
u32 type;
- if (!tb[NFTA_NG_DREG] ||
- !tb[NFTA_NG_UNTIL] ||
+ if (!tb[NFTA_NG_DREG] ||
+ !tb[NFTA_NG_MODULUS] ||
!tb[NFTA_NG_TYPE])
return ERR_PTR(-EINVAL);
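numgen renames "until" to the more accurate "modulus" and gains the same offset handling as nft_hash. The incremental generator keeps its lock-free counter: a read/compute/compare-and-swap loop that retries if another CPU raced in, and the eval now writes the freshly computed nval (plus offset) into the register instead of re-reading the shared counter, which might already have advanced. A stand-alone sketch of the CAS loop with C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint counter;

/* Next value in [offset, offset + modulus - 1], advancing the shared
 * counter without a lock (sketch of nft_ng_inc_eval's loop). */
static unsigned int ng_inc(unsigned int modulus, unsigned int offset)
{
	unsigned int oval, nval;

	do {
		oval = atomic_load(&counter);
		nval = (oval + 1 < modulus) ? oval + 1 : 0;
	} while (!atomic_compare_exchange_weak(&counter, &oval, nval));

	return nval + offset;
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%u ", ng_inc(3, 10));
	printf("\n");   /* 11 12 10 11 12 10 11 12 */
	return 0;
}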
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 12cd4bf..b2f8861 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -92,6 +92,8 @@ static void nft_payload_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!pkt->tprot_set)
+ goto err;
offset = pkt->xt.thoff;
break;
default:
@@ -184,6 +186,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!pkt->tprot_set)
+ goto err;
offset = pkt->xt.thoff;
break;
default:
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 61d216e..393d359 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -22,9 +22,10 @@
static u32 jhash_initval __read_mostly;
struct nft_queue {
- u16 queuenum;
- u16 queues_total;
- u16 flags;
+ enum nft_registers sreg_qnum:8;
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
};
static void nft_queue_eval(const struct nft_expr *expr,
@@ -54,31 +55,78 @@ static void nft_queue_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static void nft_queue_sreg_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+ u32 queue, ret;
+
+ queue = regs->data[priv->sreg_qnum];
+
+ ret = NF_QUEUE_NR(queue);
+ if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ regs->verdict.code = ret;
+}
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
[NFTA_QUEUE_FLAGS] = { .type = NLA_U16 },
+ [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 },
};
static int nft_queue_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_queue *priv = nft_expr_priv(expr);
+ u32 maxid;
- if (tb[NFTA_QUEUE_NUM] == NULL)
- return -EINVAL;
-
- init_hashrandom(&jhash_initval);
priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
- if (tb[NFTA_QUEUE_TOTAL] != NULL)
+ if (tb[NFTA_QUEUE_TOTAL])
priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
- if (tb[NFTA_QUEUE_FLAGS] != NULL) {
+ else
+ priv->queues_total = 1;
+
+ if (priv->queues_total == 0)
+ return -EINVAL;
+
+ maxid = priv->queues_total - 1 + priv->queuenum;
+ if (maxid > U16_MAX)
+ return -ERANGE;
+
+ if (tb[NFTA_QUEUE_FLAGS]) {
+ priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
+ if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nft_queue_sreg_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
+ err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_QUEUE_FLAGS]) {
priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
return -EINVAL;
+ if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT)
+ return -EOPNOTSUPP;
}
+
return 0;
}
@@ -97,6 +145,21 @@ nla_put_failure:
return -1;
}
+static int
+nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_queue *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) ||
+ nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
static struct nft_expr_type nft_queue_type;
static const struct nft_expr_ops nft_queue_ops = {
.type = &nft_queue_type,
@@ -106,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = {
.dump = nft_queue_dump,
};
+static const struct nft_expr_ops nft_queue_sreg_ops = {
+ .type = &nft_queue_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)),
+ .eval = nft_queue_sreg_eval,
+ .init = nft_queue_sreg_init,
+ .dump = nft_queue_sreg_dump,
+};
+
+static const struct nft_expr_ops *
+nft_queue_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM])
+ return ERR_PTR(-EINVAL);
+
+ init_hashrandom(&jhash_initval);
+
+ if (tb[NFTA_QUEUE_NUM])
+ return &nft_queue_ops;
+
+ if (tb[NFTA_QUEUE_SREG_QNUM])
+ return &nft_queue_sreg_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
static struct nft_expr_type nft_queue_type __read_mostly = {
.name = "queue",
- .ops = &nft_queue_ops,
+ .select_ops = &nft_queue_select_ops,
.policy = nft_queue_policy,
.maxattr = NFTA_QUEUE_MAX,
.owner = THIS_MODULE,
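Instead of growing the eval function with branches, the queue expression switches to the select_ops mechanism: based on which attributes the rule supplies (a fixed NFTA_QUEUE_NUM versus a source register NFTA_QUEUE_SREG_QNUM), it picks one of two complete ops vtables at rule-creation time, and supplying both is rejected up front. A minimal sketch of attribute-driven ops selection; the types are simplified stand-ins:

#include <stdio.h>

struct ops {
	const char *name;
	void (*eval)(void);
};

static void eval_fixed(void) { puts("queue to fixed number"); }
static void eval_sreg(void)  { puts("queue to number from register"); }

static const struct ops fixed_ops = { "fixed", eval_fixed };
static const struct ops sreg_ops  = { "sreg", eval_sreg };

/* Pick the ops vtable from which attributes the user supplied. */
static const struct ops *select_ops(int has_num, int has_sreg)
{
	if (has_num && has_sreg)
		return NULL;            /* -EINVAL: ambiguous */
	if (has_num)
		return &fixed_ops;
	if (has_sreg)
		return &sreg_ops;
	return NULL;                    /* -EINVAL: neither given */
}

int main(void)
{
	const struct ops *ops = select_ops(0, 1);

	if (ops)
		ops->eval();
	return 0;
}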
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 6eafbf9..c00104c 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -21,10 +21,10 @@ struct nft_quota {
atomic64_t remain;
};
-static inline long nft_quota(struct nft_quota *priv,
- const struct nft_pktinfo *pkt)
+static inline bool nft_overquota(struct nft_quota *priv,
+ const struct nft_pktinfo *pkt)
{
- return atomic64_sub_return(pkt->skb->len, &priv->remain);
+ return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
}
static void nft_quota_eval(const struct nft_expr *expr,
@@ -33,7 +33,7 @@ static void nft_quota_eval(const struct nft_expr *expr,
{
struct nft_quota *priv = nft_expr_priv(expr);
- if (nft_quota(priv, pkt) < 0 && !priv->invert)
+ if (nft_overquota(priv, pkt) ^ priv->invert)
regs->verdict.code = NFT_BREAK;
}
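The quota predicate collapses to a single XOR: the rule breaks either when the quota is exceeded (normal mode) or when it is not yet exceeded (inverted mode). The old test only broke when over quota and not inverted, so an inverted quota never terminated the rule; renaming the helper to a bool-returning nft_overquota() also makes the comparison explicit. The truth table, as a quick check:

#include <stdio.h>

int main(void)
{
	/* NFT_BREAK is issued exactly when overquota ^ invert is true. */
	for (int over = 0; over <= 1; over++)
		for (int inv = 0; inv <= 1; inv++)
			printf("overquota=%d invert=%d -> %s\n", over, inv,
			       (over ^ inv) ? "break" : "match");
	return 0;
}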
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
new file mode 100644
index 0000000..c6d5358
--- /dev/null
+++ b/net/netfilter/nft_range.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_range_expr {
+ struct nft_data data_from;
+ struct nft_data data_to;
+ enum nft_registers sreg:8;
+ u8 len;
+ enum nft_range_ops op:8;
+};
+
+static void nft_range_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_range_expr *priv = nft_expr_priv(expr);
+ bool mismatch;
+ int d1, d2;
+
+ d1 = memcmp(&regs->data[priv->sreg], &priv->data_from, priv->len);
+ d2 = memcmp(&regs->data[priv->sreg], &priv->data_to, priv->len);
+ switch (priv->op) {
+ case NFT_RANGE_EQ:
+ mismatch = (d1 < 0 || d2 > 0);
+ break;
+ case NFT_RANGE_NEQ:
+ mismatch = (d1 >= 0 && d2 <= 0);
+ break;
+ }
+
+ if (mismatch)
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
+ [NFTA_RANGE_SREG] = { .type = NLA_U32 },
+ [NFTA_RANGE_OP] = { .type = NLA_U32 },
+ [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED },
+ [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED },
+};
+
+static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_range_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc_from, desc_to;
+ int err;
+
+ err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
+ &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+ if (err < 0)
+ return err;
+
+ err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
+ &desc_to, tb[NFTA_RANGE_TO_DATA]);
+ if (err < 0)
+ goto err1;
+
+ if (desc_from.len != desc_to.len) {
+ err = -EINVAL;
+ goto err2;
+ }
+
+ priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
+ err = nft_validate_register_load(priv->sreg, desc_from.len);
+ if (err < 0)
+ goto err2;
+
+ priv->op = ntohl(nla_get_be32(tb[NFTA_RANGE_OP]));
+ priv->len = desc_from.len;
+ return 0;
+err2:
+ nft_data_uninit(&priv->data_to, desc_to.type);
+err1:
+ nft_data_uninit(&priv->data_from, desc_from.type);
+ return err;
+}
+
+static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_range_expr *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from,
+ NFT_DATA_VALUE, priv->len) < 0 ||
+ nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_range_type;
+static const struct nft_expr_ops nft_range_ops = {
+ .type = &nft_range_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
+ .eval = nft_range_eval,
+ .init = nft_range_init,
+ .dump = nft_range_dump,
+};
+
+static struct nft_expr_type nft_range_type __read_mostly = {
+ .name = "range",
+ .ops = &nft_range_ops,
+ .policy = nft_range_policy,
+ .maxattr = NFTA_RANGE_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_range_module_init(void)
+{
+ return nft_register_expr(&nft_range_type);
+}
+
+void nft_range_module_exit(void)
+{
+ nft_unregister_expr(&nft_range_type);
+}
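The new range expression compares the source register against both bounds with memcmp() over big-endian data, so one pair of comparisons handles keys of any length: d1 < 0 means below the lower bound, d2 > 0 above the upper bound, and NEQ inverts the containment test. One caveat visible above: the switch has no default arm and priv->op is taken from netlink unchecked, so an op outside EQ/NEQ would leave mismatch uninitialized; validating NFTA_RANGE_OP at init time would close that. A user-space replica of the predicate:

#include <stdio.h>
#include <string.h>

enum range_op { RANGE_EQ, RANGE_NEQ };

/* Replica of nft_range_eval()'s core: data is compared as raw
 * big-endian bytes, so memcmp() order matches numeric order. */
static int range_matches(const void *key, const void *from, const void *to,
			 size_t len, enum range_op op)
{
	int d1 = memcmp(key, from, len);
	int d2 = memcmp(key, to, len);
	int inside = d1 >= 0 && d2 <= 0;

	return op == RANGE_EQ ? inside : !inside;
}

int main(void)
{
	/* 16-bit ports in network byte order: range 1000..2000. */
	unsigned char from[2] = { 0x03, 0xe8 };    /* 1000 */
	unsigned char to[2]   = { 0x07, 0xd0 };    /* 2000 */
	unsigned char key[2]  = { 0x05, 0x00 };    /* 1280 */

	printf("EQ:  %d\n", range_matches(key, from, to, 2, RANGE_EQ));
	printf("NEQ: %d\n", range_matches(key, from, to, 2, RANGE_NEQ));
	return 0;
}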
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 515131f..dbd6c4a 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -24,7 +24,6 @@ static DEFINE_MUTEX(xt_rateest_mutex);
#define RATEEST_HSIZE 16
static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
static unsigned int jhash_rnd __read_mostly;
-static bool rnd_inited __read_mostly;
static unsigned int xt_rateest_hash(const char *name)
{
@@ -99,10 +98,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
} cfg;
int ret;
- if (unlikely(!rnd_inited)) {
- get_random_bytes(&jhash_rnd, sizeof(jhash_rnd));
- rnd_inited = true;
- }
+ net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
est = xt_rateest_lookup(info->name);
if (est) {
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index e118397..872db2d 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -110,18 +110,14 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
struct net *net = par->net;
unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
+ unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
- if (dst_mtu(skb_dst(skb)) <= minlen) {
+ if (min_mtu <= minlen) {
net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
- dst_mtu(skb_dst(skb)));
+ min_mtu);
return -1;
}
- if (in_mtu <= minlen) {
- net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
- in_mtu);
- return -1;
- }
- newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
+ newmss = min_mtu - minlen;
} else
newmss = info->mss;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 6e57a39..0471db4 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -89,6 +89,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->oif[0]) {
+ int ret;
+
if (info->oif[sizeof(info->oif)-1] != '\0')
return -EINVAL;
@@ -101,7 +103,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
priv->notifier.notifier_call = tee_netdev_event;
info->priv = priv;
- register_netdevice_notifier(&priv->notifier);
+ ret = register_netdevice_notifier(&priv->notifier);
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
} else
info->priv = NULL;
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 99bbc82..b6dc322 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -366,14 +366,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
unsigned int i;
int ret;
- if (unlikely(!connlimit_rnd)) {
- u_int32_t rand;
+ net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
- do {
- get_random_bytes(&rand, sizeof(rand));
- } while (!rand);
- cmpxchg(&connlimit_rnd, 0, rand);
- }
ret = nf_ct_l3proto_try_module_get(par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for "
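Both xt_RATEEST and xt_connlimit drop their hand-rolled first-use seeding (a bool guard in one, a get_random_bytes()/cmpxchg loop in the other) for net_get_random_once(), which gives exactly-once lazy initialization that is safe against concurrent first callers and cheap afterwards. The shape of the idiom in portable user-space terms, with pthread_once() standing in for the kernel's once machinery and srand()/rand() for get_random_bytes():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static unsigned int jhash_rnd;
static pthread_once_t rnd_once = PTHREAD_ONCE_INIT;

static void init_rnd(void)
{
	srand(12345);             /* stand-in for get_random_bytes() */
	jhash_rnd = (unsigned int)rand();
}

static unsigned int hash_with_seed(unsigned int key)
{
	/* Exactly-once lazy seeding, safe under concurrency --
	 * the role net_get_random_once() plays in the kernel. */
	pthread_once(&rnd_once, init_rnd);
	return key ^ jhash_rnd;   /* trivial placeholder hash */
}

int main(void)
{
	printf("%u %u\n", hash_with_seed(1), hash_with_seed(2));
	return 0;
}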
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 1786968..44a095e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,6 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
}
/* need to declare this at the top */
+static const struct file_operations dl_file_ops_v1;
static const struct file_operations dl_file_ops;
/* hash table crap */
@@ -86,8 +87,8 @@ struct dsthash_ent {
unsigned long expires; /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
- u_int32_t credit;
- u_int32_t credit_cap, cost;
+ u_int64_t credit;
+ u_int64_t credit_cap, cost;
} rateinfo;
struct rcu_head rcu;
};
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
u_int8_t family;
bool rnd_initialized;
- struct hashlimit_cfg1 cfg; /* config */
+ struct hashlimit_cfg2 cfg; /* config */
/* used internally */
spinlock_t lock; /* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
struct hlist_head hash[0]; /* hashtable itself */
};
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+ if (revision == 1) {
+ struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+ to->mode = cfg->mode;
+ to->avg = cfg->avg;
+ to->burst = cfg->burst;
+ to->size = cfg->size;
+ to->max = cfg->max;
+ to->gc_interval = cfg->gc_interval;
+ to->expire = cfg->expire;
+ to->srcmask = cfg->srcmask;
+ to->dstmask = cfg->dstmask;
+ } else if (revision == 2) {
+ memcpy(to, from, sizeof(struct hashlimit_cfg2));
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */
static struct kmem_cache *hashlimit_cachep __read_mostly;
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
}
static void htable_gc(struct work_struct *work);
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
- u_int8_t family)
+static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+ const char *name, u_int8_t family,
+ struct xt_hashlimit_htable **out_hinfo,
+ int revision)
{
struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
struct xt_hashlimit_htable *hinfo;
- unsigned int size;
- unsigned int i;
+ unsigned int size, i;
+ int ret;
- if (minfo->cfg.size) {
- size = minfo->cfg.size;
+ if (cfg->size) {
+ size = cfg->size;
} else {
size = (totalram_pages << PAGE_SHIFT) / 16384 /
sizeof(struct list_head);
@@ -238,10 +265,14 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
sizeof(struct list_head) * size);
if (hinfo == NULL)
return -ENOMEM;
- minfo->hinfo = hinfo;
+ *out_hinfo = hinfo;
/* copy match config into hashtable config */
- memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
+
+ if (ret)
+ return ret;
+
hinfo->cfg.size = size;
if (hinfo->cfg.max == 0)
hinfo->cfg.max = 8 * hinfo->cfg.size;
@@ -255,17 +286,18 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
- hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+ hinfo->name = kstrdup(name, GFP_KERNEL);
if (!hinfo->name) {
vfree(hinfo);
return -ENOMEM;
}
spin_lock_init(&hinfo->lock);
- hinfo->pde = proc_create_data(minfo->name, 0,
+ hinfo->pde = proc_create_data(name, 0,
(family == NFPROTO_IPV4) ?
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
- &dl_file_ops, hinfo);
+ (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
+ hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
vfree(hinfo);
@@ -398,7 +430,8 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
(slowest userspace tool allows), which means
CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
*/
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24))
/* Repeated shift and or gives us all 1s, final shift and add 1 gives
* us the power of 2 below the theoretical max, so GCC simply does a
@@ -408,9 +441,12 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1)
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
/* in byte mode, the lowest possible rate is one packet/second.
* credit_cap is used as a counter that tells us how many times we can
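Widening the credit counters to 64 bits requires a 64-bit CREDITS_PER_JIFFY, hence the new _POW2_BELOW64/POW2_BELOW64 rungs. The macro ladder smears the leading set bit of x rightwards with shift-or steps (1, 2, 4, ... 32 bits), turning x into all-ones below its top bit; shifting right once and adding one then yields the largest power of two not exceeding x, entirely at compile time. A quick replica and demonstration (HZ assumed to be 100 for the example):

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define _POW2_BELOW2(x)  ((x)|((x)>>1))
#define _POW2_BELOW4(x)  (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
#define _POW2_BELOW8(x)  (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
#define POW2_BELOW64(x)  ((_POW2_BELOW64(x)>>1) + 1)

int main(void)
{
	/* Smear-and-round: largest power of two <= x. */
	uint64_t x = UINT64_C(0xFFFFFFFFFFFFFFFF) / (100 * 60 * 60 * 24);

	printf("x            = %" PRIu64 "\n", x);
	printf("POW2_BELOW64 = %" PRIu64 "\n", POW2_BELOW64(x));
	return 0;
}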
@@ -425,14 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
}
/* Precision saver. */
-static u32 user2credits(u32 user)
+static u64 user2credits(u64 user, int revision)
{
- /* If multiplying would overflow... */
- if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
- /* Divide first. */
- return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+ if (revision == 1) {
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
+ /* Divide first. */
+ return (user / XT_HASHLIMIT_SCALE) *\
+ HZ * CREDITS_PER_JIFFY_v1;
+
+ return (user * HZ * CREDITS_PER_JIFFY_v1) \
+ / XT_HASHLIMIT_SCALE;
+ } else {
+ if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ return (user / XT_HASHLIMIT_SCALE_v2) *\
+ HZ * CREDITS_PER_JIFFY;
- return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+ return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2;
+ }
}
static u32 user2credits_byte(u32 user)
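user2credits() keeps its precision-saving trick in both revisions: multiplying before dividing preserves precision but overflows for large rates, so when user * HZ * CREDITS_PER_JIFFY would not fit, it divides by the scale first and multiplies afterwards, trading a little precision for correctness; the v2 path runs the same dance against 64-bit limits and the finer XT_HASHLIMIT_SCALE_v2. A sketch of the two orderings with stand-in constants:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define SCALE 10000ull        /* stand-in for XT_HASHLIMIT_SCALE */
#define CPJ   32768ull        /* stand-in for CREDITS_PER_JIFFY */
#define HZ_   100ull

static uint64_t user2credits(uint64_t user)
{
	/* If multiplying first would overflow 64 bits, divide first:
	 * less precise, but correct. */
	if (user > UINT64_MAX / (HZ_ * CPJ))
		return (user / SCALE) * HZ_ * CPJ;

	return (user * HZ_ * CPJ) / SCALE;
}

int main(void)
{
	printf("small rate: %" PRIu64 "\n", user2credits(12345));
	printf("huge rate:  %" PRIu64 "\n", user2credits(UINT64_MAX / 1000));
	return 0;
}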
@@ -442,10 +488,11 @@ static u32 user2credits_byte(u32 user)
return (u32) (us >> 32);
}
-static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+ u32 mode, int revision)
{
unsigned long delta = now - dh->rateinfo.prev;
- u32 cap;
+ u64 cap, cpj;
if (delta == 0)
return;
@@ -453,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
dh->rateinfo.prev = now;
if (mode & XT_HASHLIMIT_BYTES) {
- u32 tmp = dh->rateinfo.credit;
+ u64 tmp = dh->rateinfo.credit;
dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
cap = CREDITS_PER_JIFFY_BYTES * HZ;
if (tmp >= dh->rateinfo.credit) {/* overflow */
@@ -461,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
return;
}
} else {
- dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+ cpj = (revision == 1) ?
+ CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
+ dh->rateinfo.credit += delta * cpj;
cap = dh->rateinfo.credit_cap;
}
if (dh->rateinfo.credit > cap)
@@ -469,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
}
static void rateinfo_init(struct dsthash_ent *dh,
- struct xt_hashlimit_htable *hinfo)
+ struct xt_hashlimit_htable *hinfo, int revision)
{
dh->rateinfo.prev = jiffies;
if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
@@ -478,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
dh->rateinfo.credit_cap = hinfo->cfg.burst;
} else {
dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ hinfo->cfg.burst, revision);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision);
dh->rateinfo.credit_cap = dh->rateinfo.credit;
}
}
@@ -603,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
}
static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
+ struct xt_hashlimit_htable *hinfo,
+ const struct hashlimit_cfg2 *cfg, int revision)
{
- const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
- struct xt_hashlimit_htable *hinfo = info->hinfo;
unsigned long now = jiffies;
struct dsthash_ent *dh;
struct dsthash_dst dst;
bool race = false;
- u32 cost;
+ u64 cost;
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
@@ -626,18 +675,18 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
} else if (race) {
/* Already got an entry, update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
} else {
dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_init(dh, hinfo);
+ rateinfo_init(dh, hinfo, revision);
}
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
}
- if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+ if (cfg->mode & XT_HASHLIMIT_BYTES)
cost = hashlimit_byte_cost(skb->len, dh);
else
cost = dh->rateinfo.cost;
@@ -647,84 +696,157 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
dh->rateinfo.credit -= cost;
spin_unlock(&dh->lock);
rcu_read_unlock_bh();
- return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+ return !(cfg->mode & XT_HASHLIMIT_INVERT);
}
spin_unlock(&dh->lock);
rcu_read_unlock_bh();
/* default match is underlimit - so over the limit, we need to invert */
- return info->cfg.mode & XT_HASHLIMIT_INVERT;
+ return cfg->mode & XT_HASHLIMIT_INVERT;
hotdrop:
par->hotdrop = true;
return false;
}
-static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+static bool
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+ struct xt_hashlimit_htable *hinfo = info->hinfo;
+ struct hashlimit_cfg2 cfg = {};
+ int ret;
+
+ ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+ if (ret)
+ return ret;
+
+ return hashlimit_mt_common(skb, par, hinfo, &cfg, 1);
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+ struct xt_hashlimit_htable *hinfo = info->hinfo;
+
+ return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
+}
+
+static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
+ struct xt_hashlimit_htable **hinfo,
+ struct hashlimit_cfg2 *cfg,
+ const char *name, int revision)
{
struct net *net = par->net;
- struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
int ret;
- if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
- return -EINVAL;
- if (info->name[sizeof(info->name)-1] != '\0')
+ if (cfg->gc_interval == 0 || cfg->expire == 0)
return -EINVAL;
if (par->family == NFPROTO_IPV4) {
- if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+ if (cfg->srcmask > 32 || cfg->dstmask > 32)
return -EINVAL;
} else {
- if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+ if (cfg->srcmask > 128 || cfg->dstmask > 128)
return -EINVAL;
}
- if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+ if (cfg->mode & ~XT_HASHLIMIT_ALL) {
pr_info("Unknown mode mask %X, kernel too old?\n",
- info->cfg.mode);
+ cfg->mode);
return -EINVAL;
}
/* Check for overflow. */
- if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
- if (user2credits_byte(info->cfg.avg) == 0) {
- pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+ if (cfg->mode & XT_HASHLIMIT_BYTES) {
+ if (user2credits_byte(cfg->avg) == 0) {
+ pr_info("overflow, rate too high: %llu\n", cfg->avg);
return -EINVAL;
}
- } else if (info->cfg.burst == 0 ||
- user2credits(info->cfg.avg * info->cfg.burst) <
- user2credits(info->cfg.avg)) {
- pr_info("overflow, try lower: %u/%u\n",
- info->cfg.avg, info->cfg.burst);
+ } else if (cfg->burst == 0 ||
+ user2credits(cfg->avg * cfg->burst, revision) <
+ user2credits(cfg->avg, revision)) {
+ pr_info("overflow, try lower: %llu/%llu\n",
+ cfg->avg, cfg->burst);
return -ERANGE;
}
mutex_lock(&hashlimit_mutex);
- info->hinfo = htable_find_get(net, info->name, par->family);
- if (info->hinfo == NULL) {
- ret = htable_create(net, info, par->family);
+ *hinfo = htable_find_get(net, name, par->family);
+ if (*hinfo == NULL) {
+ ret = htable_create(net, cfg, name, par->family,
+ hinfo, revision);
if (ret < 0) {
mutex_unlock(&hashlimit_mutex);
return ret;
}
}
mutex_unlock(&hashlimit_mutex);
+
return 0;
}
-static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+ struct hashlimit_cfg2 cfg = {};
+ int ret;
+
+ if (info->name[sizeof(info->name) - 1] != '\0')
+ return -EINVAL;
+
+ ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+ if (ret)
+ return ret;
+
+ return hashlimit_mt_check_common(par, &info->hinfo,
+ &cfg, info->name, 1);
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+ if (info->name[sizeof(info->name) - 1] != '\0')
+ return -EINVAL;
+
+ return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
+ info->name, 2);
+}
+
+static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
{
const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
htable_put(info->hinfo);
}
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+ htable_put(info->hinfo);
+}
+
static struct xt_match hashlimit_mt_reg[] __read_mostly = {
{
.name = "hashlimit",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = hashlimit_mt,
+ .match = hashlimit_mt_v1,
.matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .checkentry = hashlimit_mt_check_v1,
+ .destroy = hashlimit_mt_destroy_v1,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "hashlimit",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .match = hashlimit_mt,
+ .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
.checkentry = hashlimit_mt_check,
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
@@ -734,8 +856,18 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
.name = "hashlimit",
.revision = 1,
.family = NFPROTO_IPV6,
- .match = hashlimit_mt,
+ .match = hashlimit_mt_v1,
.matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .checkentry = hashlimit_mt_check_v1,
+ .destroy = hashlimit_mt_destroy_v1,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "hashlimit",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .match = hashlimit_mt,
+ .matchsize = sizeof(struct xt_hashlimit_mtinfo2),
.checkentry = hashlimit_mt_check,
.destroy = hashlimit_mt_destroy,
.me = THIS_MODULE,
@@ -786,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v)
spin_unlock_bh(&htable->lock);
}
-static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
- struct seq_file *s)
+static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
{
- const struct xt_hashlimit_htable *ht = s->private;
-
- spin_lock(&ent->lock);
- /* recalculate to show accurate numbers */
- rateinfo_recalc(ent, jiffies, ht->cfg.mode);
-
switch (family) {
case NFPROTO_IPV4:
- seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+ seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip.src,
ntohs(ent->dst.src_port),
@@ -808,7 +934,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
break;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
case NFPROTO_IPV6:
- seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+ seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip6.src,
ntohs(ent->dst.src_port),
@@ -821,10 +947,52 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
default:
BUG();
}
+}
+
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
+{
+ const struct xt_hashlimit_htable *ht = s->private;
+
+ spin_lock(&ent->lock);
+ /* recalculate to show accurate numbers */
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1);
+
+ dl_seq_print(ent, family, s);
+
+ spin_unlock(&ent->lock);
+ return seq_has_overflowed(s);
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
+{
+ const struct xt_hashlimit_htable *ht = s->private;
+
+ spin_lock(&ent->lock);
+ /* recalculate to show accurate numbers */
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+
+ dl_seq_print(ent, family, s);
+
spin_unlock(&ent->lock);
return seq_has_overflowed(s);
}
+static int dl_seq_show_v1(struct seq_file *s, void *v)
+{
+ struct xt_hashlimit_htable *htable = s->private;
+ unsigned int *bucket = (unsigned int *)v;
+ struct dsthash_ent *ent;
+
+ if (!hlist_empty(&htable->hash[*bucket])) {
+ hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+ if (dl_seq_real_show_v1(ent, htable->family, s))
+ return -1;
+ }
+ return 0;
+}
+
static int dl_seq_show(struct seq_file *s, void *v)
{
struct xt_hashlimit_htable *htable = s->private;
@@ -839,6 +1007,13 @@ static int dl_seq_show(struct seq_file *s, void *v)
return 0;
}
+static const struct seq_operations dl_seq_ops_v1 = {
+ .start = dl_seq_start,
+ .next = dl_seq_next,
+ .stop = dl_seq_stop,
+ .show = dl_seq_show_v1
+};
+
static const struct seq_operations dl_seq_ops = {
.start = dl_seq_start,
.next = dl_seq_next,
@@ -846,17 +1021,37 @@ static const struct seq_operations dl_seq_ops = {
.show = dl_seq_show
};
+static int dl_proc_open_v1(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &dl_seq_ops_v1);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ sf->private = PDE_DATA(inode);
+ }
+ return ret;
+}
+
static int dl_proc_open(struct inode *inode, struct file *file)
{
int ret = seq_open(file, &dl_seq_ops);
if (!ret) {
struct seq_file *sf = file->private_data;
+
sf->private = PDE_DATA(inode);
}
return ret;
}
+static const struct file_operations dl_file_ops_v1 = {
+ .owner = THIS_MODULE,
+ .open = dl_proc_open_v1,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
static const struct file_operations dl_file_ops = {
.owner = THIS_MODULE,
.open = dl_proc_open,
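
The -ERANGE test above is an overflow probe: for any burst >= 1 and a monotonic user2credits(), credits(avg * burst) can only come out smaller than credits(avg) if the 64-bit product wrapped. A minimal userspace sketch, with a simplified stand-in for user2credits() (the real conversion involves HZ and a scale constant):

#include <stdint.h>
#include <stdio.h>

/* Stand-in credit conversion; monotonic in its argument, which is
 * all the overflow check relies on.
 */
static uint64_t user2credits(uint64_t user)
{
	return user / 1000;
}

static int check_cfg(uint64_t avg, uint64_t burst)
{
	if (burst == 0 ||
	    user2credits(avg * burst) < user2credits(avg))
		return -1;			/* -ERANGE in the kernel */
	return 0;
}

int main(void)
{
	printf("%d\n", check_cfg(1000, 5));		/* 0: sane config */
	printf("%d\n", check_cfg(1ULL << 63, 2));	/* -1: product wraps to 0 */
	return 0;
}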
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 9f4ab00..f679dd4 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -41,7 +41,7 @@ helper_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (!master_help)
return ret;
- /* rcu_read_lock()ed by nf_hook_slow */
+ /* rcu_read_lock()ed by nf_hook_thresh */
helper = rcu_dereference(master_help->helper);
if (!helper)
return ret;
@@ -65,7 +65,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
par->family);
return ret;
}
- info->name[29] = '\0';
+ info->name[sizeof(info->name) - 1] = '\0';
return 0;
}
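
Replacing the magic index 29 with sizeof(info->name) - 1 keeps the forced NUL termination correct even if the field is ever resized. A small sketch of the pattern, using a hypothetical stand-in struct:

#include <stdio.h>
#include <string.h>

struct helper_info_demo {			/* hypothetical stand-in */
	char name[30];
};

int main(void)
{
	struct helper_info_demo info;
	const char *user = "a-conntrack-helper-name-longer-than-the-field";

	strncpy(info.name, user, sizeof(info.name));	/* may not NUL-terminate */
	info.name[sizeof(info.name) - 1] = '\0';	/* tracks the array size */
	printf("%s\n", info.name);
	return 0;
}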
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index d725a27..e3b7a09 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -110,7 +110,6 @@ static const struct file_operations recent_old_fops, recent_mt_fops;
#endif
static u_int32_t hash_rnd __read_mostly;
-static bool hash_rnd_inited __read_mostly;
static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
{
@@ -340,10 +339,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
int ret = -EINVAL;
size_t sz;
- if (unlikely(!hash_rnd_inited)) {
- get_random_bytes(&hash_rnd, sizeof(hash_rnd));
- hash_rnd_inited = true;
- }
+ net_get_random_once(&hash_rnd, sizeof(hash_rnd));
+
if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
pr_info("Unsupported user space flags (%08x)\n",
info->check_set);
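
net_get_random_once() replaces an open-coded check-and-set that could race if two contexts hit the first use concurrently; the helper guarantees exactly one initialisation. A userspace analogue of the same idea built on pthread_once():

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t hash_rnd;
static pthread_once_t hash_rnd_once = PTHREAD_ONCE_INIT;

static void hash_rnd_init(void)
{
	hash_rnd = (uint32_t)random();	/* stand-in for get_random_bytes() */
}

static uint32_t get_hash_rnd(void)
{
	pthread_once(&hash_rnd_once, hash_rnd_init);	/* runs init once, ever */
	return hash_rnd;
}

int main(void)
{
	printf("%08x\n", get_hash_rnd());
	printf("%08x\n", get_hash_rnd());	/* same value: no re-init */
	return 0;
}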
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index ef36a56..4dedb96 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb,
++i, offset, sch->type, htons(sch->length),
sch->flags);
#endif
- offset += WORD_ROUND(ntohs(sch->length));
+ offset += SCTP_PAD4(ntohs(sch->length));
pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
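
SCTP chunk lengths are padded to 4-byte boundaries on the wire, so the walk advances by the rounded-up length; SCTP_PAD4() is the renamed WORD_ROUND(). The rounding in isolation:

#include <stdio.h>

#define SCTP_PAD4(s) (((s) + 3) & ~3)	/* round up to a multiple of 4 */

int main(void)
{
	for (int len = 13; len <= 16; len++)
		printf("chunk len %2d -> next chunk at +%d\n",
		       len, SCTP_PAD4(len));
	return 0;
}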
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 863e992..4e03f64 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -160,7 +160,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_mpls *mpls)
{
- __be32 *new_mpls_lse;
+ struct mpls_shim_hdr *new_mpls_lse;
	/* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
if (skb->encapsulation)
@@ -180,8 +180,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);
- new_mpls_lse = (__be32 *)skb_mpls_header(skb);
- *new_mpls_lse = mpls->mpls_lse;
+ new_mpls_lse = mpls_hdr(skb);
+ new_mpls_lse->label_stack_entry = mpls->mpls_lse;
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
@@ -202,7 +202,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
if (unlikely(err))
return err;
- skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN);
+ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
skb->mac_len);
@@ -211,10 +211,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);
- /* skb_mpls_header() is used to locate the ethertype
- * field correctly in the presence of VLAN tags.
+ /* mpls_hdr() is used to locate the ethertype field correctly in the
+ * presence of VLAN tags.
*/
- hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
update_ethertype(skb, hdr, ethertype);
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
@@ -226,7 +226,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
const __be32 *mpls_lse, const __be32 *mask)
{
- __be32 *stack;
+ struct mpls_shim_hdr *stack;
__be32 lse;
int err;
@@ -234,16 +234,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
if (unlikely(err))
return err;
- stack = (__be32 *)skb_mpls_header(skb);
- lse = OVS_MASKED(*stack, *mpls_lse, *mask);
+ stack = mpls_hdr(skb);
+ lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
if (skb->ip_summed == CHECKSUM_COMPLETE) {
- __be32 diff[] = { ~(*stack), lse };
+ __be32 diff[] = { ~(stack->label_stack_entry), lse };
skb->csum = ~csum_partial((char *)diff, sizeof(diff),
~skb->csum);
}
- *stack = lse;
+ stack->label_stack_entry = lse;
flow_key->mpls.top_lse = lse;
return 0;
}
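
When skb->ip_summed is CHECKSUM_COMPLETE, set_mpls() patches the running checksum incrementally: folding { ~old, new } into the sum replaces the old label word with the new one without re-summing the packet. A simplified 16-bit model of that ones'-complement update:

#include <stdint.h>
#include <stdio.h>

static uint16_t csum_add16(uint32_t sum, uint16_t v)
{
	sum += v;
	return (uint16_t)((sum & 0xffff) + (sum >> 16));  /* end-around carry */
}

int main(void)
{
	uint16_t old_lse = 0x1234, new_lse = 0xabcd;
	uint16_t sum = csum_add16(0x4000, old_lse);	/* packet sum with old word */

	/* incremental: adding ~old subtracts it, then add the new word */
	uint16_t updated = csum_add16(csum_add16(sum, (uint16_t)~old_lse),
				      new_lse);
	uint16_t direct = csum_add16(0x4000, new_lse);	/* recomputed from scratch */

	printf("updated=%04x direct=%04x\n", updated, direct);	/* equal */
	return 0;
}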
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0536ab3..4d67ea8 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -928,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
- struct sw_flow_key key;
struct sw_flow_actions *acts;
struct sw_flow_match match;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
@@ -956,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
/* Extract key. */
- ovs_match_init(&match, &key, &mask);
+ ovs_match_init(&match, &new_flow->key, false, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
- ovs_flow_mask_key(&new_flow->key, &key, true, &mask);
-
/* Extract flow identifier. */
error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
- &key, log);
+ &new_flow->key, log);
if (error)
goto err_kfree_flow;
+ /* unmasked key is needed to match when ufid is not used. */
+ if (ovs_identifier_is_key(&new_flow->id))
+ match.key = new_flow->id.unmasked_key;
+
+ ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);
+
/* Validate actions. */
error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
&new_flow->key, &acts, log);
@@ -996,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (ovs_identifier_is_ufid(&new_flow->id))
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
if (!flow)
- flow = ovs_flow_tbl_lookup(&dp->table, &key);
+ flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
@@ -1121,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, &mask);
+ ovs_match_init(&match, &key, true, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
} else if (!ufid_present) {
@@ -1238,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
log);
} else if (!ufid_present) {
@@ -1297,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
NULL, log);
if (unlikely(err))
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 0fa45439..c8c82e1 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -633,12 +633,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
} else if (eth_p_mpls(key->eth.type)) {
size_t stack_len = MPLS_HLEN;
- /* In the presence of an MPLS label stack the end of the L2
- * header and the beginning of the L3 header differ.
- *
- * Advance network_header to the beginning of the L3
- * header. mac_len corresponds to the end of the L2 header.
- */
+ skb_set_inner_network_header(skb, skb->mac_len);
while (1) {
__be32 lse;
@@ -646,12 +641,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
if (unlikely(error))
return 0;
- memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+ memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN);
if (stack_len == MPLS_HLEN)
memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
- skb_set_network_header(skb, skb->mac_len + stack_len);
+ skb_set_inner_network_header(skb, skb->mac_len + stack_len);
if (lse & htonl(MPLS_LS_S_MASK))
break;
@@ -767,8 +762,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
{
int err;
- memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE);
-
/* Extract metadata from netlink attributes. */
err = ovs_nla_get_flow_metadata(net, attr, key, log);
if (err)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 8efa718..ae25ded 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1996,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
void ovs_match_init(struct sw_flow_match *match,
struct sw_flow_key *key,
+ bool reset_key,
struct sw_flow_mask *mask)
{
memset(match, 0, sizeof(*match));
match->key = key;
match->mask = mask;
- memset(key, 0, sizeof(*key));
+ if (reset_key)
+ memset(key, 0, sizeof(*key));
if (mask) {
memset(&mask->key, 0, sizeof(mask->key));
@@ -2049,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
struct nlattr *a;
int err = 0, start, opts_type;
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
if (opts_type < 0)
return opts_type;
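
The new reset_key flag lets ovs_flow_cmd_new() build the match directly into the freshly allocated (and already zeroed) flow key instead of filling a large on-stack key and copying it over. A sketch of the two calling styles, with simplified stand-in types:

#include <stdbool.h>
#include <string.h>

struct sw_flow_key { unsigned char data[464]; };	/* size illustrative */
struct sw_flow_mask { struct sw_flow_key key; };
struct sw_flow_match {
	struct sw_flow_key *key;
	struct sw_flow_mask *mask;
};

static void match_init(struct sw_flow_match *match, struct sw_flow_key *key,
		       bool reset_key, struct sw_flow_mask *mask)
{
	memset(match, 0, sizeof(*match));
	match->key = key;
	match->mask = mask;
	if (reset_key)				/* skipped for pre-zeroed keys */
		memset(key, 0, sizeof(*key));
	if (mask)
		memset(&mask->key, 0, sizeof(mask->key));
}

int main(void)
{
	static struct sw_flow_key flow_key;	/* zeroed at allocation */
	struct sw_flow_key stack_key;		/* reused stack key: must reset */
	struct sw_flow_match match;

	match_init(&match, &flow_key, false, NULL);
	match_init(&match, &stack_key, true, NULL);
	return 0;
}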
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 47dd142..45f9769 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void);
size_t ovs_key_attr_size(void);
void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key, struct sw_flow_mask *mask);
+ struct sw_flow_key *key, bool reset_key,
+ struct sw_flow_mask *mask);
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index 13396c7..86f8853 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -26,6 +26,13 @@ config AF_RXRPC_IPV6
Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as
its network transport.
+config AF_RXRPC_INJECT_LOSS
+ bool "Inject packet loss into RxRPC packet stream"
+ depends on AF_RXRPC
+ help
+ Say Y here to inject packet loss by discarding some received and some
+ transmitted packets.
+
config AF_RXRPC_DEBUG
bool "RxRPC dynamic debugging"
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 09f81bef..44c9c2b 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -45,7 +45,7 @@ u32 rxrpc_epoch;
atomic_t rxrpc_debug_id;
/* count of skbs currently in use */
-atomic_t rxrpc_n_skbs;
+atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
struct workqueue_struct *rxrpc_workqueue;
@@ -136,7 +136,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
struct sock *sk = sock->sk;
struct rxrpc_local *local;
- struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ u16 service_id = srx->srx_service;
int ret;
_enter("%p,%p,%d", rx, saddr, len);
@@ -160,15 +161,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
goto error_unlock;
}
- if (rx->srx.srx_service) {
+ if (service_id) {
write_lock(&local->services_lock);
- hlist_for_each_entry(prx, &local->services, listen_link) {
- if (prx->srx.srx_service == rx->srx.srx_service)
- goto service_in_use;
- }
-
+ if (rcu_access_pointer(local->service))
+ goto service_in_use;
rx->local = local;
- hlist_add_head_rcu(&rx->listen_link, &local->services);
+ rcu_assign_pointer(local->service, rx);
write_unlock(&local->services_lock);
rx->sk.sk_state = RXRPC_SERVER_BOUND;
@@ -599,7 +597,6 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
rx->family = protocol;
rx->calls = RB_ROOT;
- INIT_HLIST_NODE(&rx->listen_link);
spin_lock_init(&rx->incoming_lock);
INIT_LIST_HEAD(&rx->sock_calls);
INIT_LIST_HEAD(&rx->to_be_accepted);
@@ -681,11 +678,9 @@ static int rxrpc_release_sock(struct sock *sk)
sk->sk_state = RXRPC_CLOSE;
spin_unlock_bh(&sk->sk_receive_queue.lock);
- ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1);
-
- if (!hlist_unhashed(&rx->listen_link)) {
+ if (rx->local && rx->local->service == rx) {
write_lock(&rx->local->services_lock);
- hlist_del_rcu(&rx->listen_link);
+ rx->local->service = NULL;
write_unlock(&rx->local->services_lock);
}
@@ -867,7 +862,8 @@ static void __exit af_rxrpc_exit(void)
proto_unregister(&rxrpc_proto);
rxrpc_destroy_all_calls();
rxrpc_destroy_all_connections();
- ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0);
rxrpc_destroy_all_locals();
remove_proc_entry("rxrpc_conns", init_net.proc_net);
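
With at most one listening socket per local endpoint, the service lookup collapses from an hlist walk to a single RCU pointer load, and bind() can refuse a second service outright. A plain-C sketch of the scheme, with the RCU primitives reduced to comments:

#include <stddef.h>
#include <stdio.h>

struct rxrpc_sock_demo { unsigned short srx_service; };

struct rxrpc_local_demo {
	struct rxrpc_sock_demo *service;	/* __rcu in the kernel */
};

/* bind: fail if a service is already registered on this endpoint */
static int local_bind(struct rxrpc_local_demo *local,
		      struct rxrpc_sock_demo *rx)
{
	if (local->service)		/* rcu_access_pointer() in the kernel */
		return -1;		/* -EADDRINUSE */
	local->service = rx;		/* rcu_assign_pointer() in the kernel */
	return 0;
}

/* incoming packet: O(1) service match instead of a list walk */
static struct rxrpc_sock_demo *find_service(struct rxrpc_local_demo *local,
					    unsigned short service_id)
{
	struct rxrpc_sock_demo *rx = local->service;	/* rcu_dereference() */

	return (rx && rx->srx_service == service_id) ? rx : NULL;
}

int main(void)
{
	struct rxrpc_local_demo local = { NULL };
	struct rxrpc_sock_demo rx = { 7000 };

	printf("bind #1: %d\n", local_bind(&local, &rx));	/* 0 */
	printf("bind #2: %d\n", local_bind(&local, &rx));	/* -1: in use */
	printf("found: %d\n", find_service(&local, 7000) != NULL);
	return 0;
}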
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index e78c40b..d38dffd 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -93,7 +93,6 @@ struct rxrpc_sock {
rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
struct rxrpc_local *local; /* local endpoint */
- struct hlist_node listen_link; /* link in the local endpoint's listen list */
struct rxrpc_backlog *backlog; /* Preallocation for services */
spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
struct list_head sock_calls; /* List of calls owned by this socket */
@@ -142,15 +141,10 @@ struct rxrpc_host_header {
*/
struct rxrpc_skb_priv {
union {
- unsigned long resend_at; /* time in jiffies at which to resend */
- struct {
- u8 nr_jumbo; /* Number of jumbo subpackets */
- };
+ u8 nr_jumbo; /* Number of jumbo subpackets */
};
union {
- unsigned int offset; /* offset into buffer of next read */
int remain; /* amount of space remaining for next write */
- u32 error; /* network error code */
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -219,7 +213,7 @@ struct rxrpc_local {
struct list_head link;
struct socket *socket; /* my UDP socket */
struct work_struct processor;
- struct hlist_head services; /* services listening on this endpoint */
+ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
@@ -258,10 +252,12 @@ struct rxrpc_peer {
/* calculated RTT cache */
#define RXRPC_RTT_CACHE_SIZE 32
- suseconds_t rtt; /* current RTT estimate (in uS) */
- unsigned int rtt_point; /* next entry at which to insert */
- unsigned int rtt_usage; /* amount of cache actually used */
- suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */
+ ktime_t rtt_last_req; /* Time of last RTT request */
+ u64 rtt; /* Current RTT estimate (in nS) */
+ u64 rtt_sum; /* Sum of cache contents */
+ u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */
+ u8 rtt_cursor; /* next entry at which to insert */
+ u8 rtt_usage; /* amount of cache actually used */
};
/*
@@ -314,6 +310,7 @@ enum rxrpc_conn_cache_state {
RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
+ RXRPC_CONN__NR_CACHE_STATES
};
/*
@@ -384,10 +381,9 @@ struct rxrpc_connection {
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
+ u32 security_nonce; /* response re-use preventer */
u8 size_align; /* data size alignment (for security) */
- u8 header_size; /* rxrpc + security header size */
u8 security_size; /* security header size */
- u32 security_nonce; /* response re-use preventer */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
@@ -402,6 +398,8 @@ enum rxrpc_call_flag {
RXRPC_CALL_EXPOSED, /* The call was exposed to the world */
RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
+ RXRPC_CALL_PINGING, /* Ping in process */
+ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
};
/*
@@ -447,6 +445,17 @@ enum rxrpc_call_completion {
};
/*
+ * Call Tx congestion management modes.
+ */
+enum rxrpc_congest_mode {
+ RXRPC_CALL_SLOW_START,
+ RXRPC_CALL_CONGEST_AVOIDANCE,
+ RXRPC_CALL_PACKET_LOSS,
+ RXRPC_CALL_FAST_RETRANSMIT,
+ NR__RXRPC_CONGEST_MODES
+};
+
+/*
* RxRPC call definition
* - matched by { connection, call_id }
*/
@@ -455,9 +464,9 @@ struct rxrpc_call {
struct rxrpc_connection *conn; /* connection carrying call */
struct rxrpc_peer *peer; /* Peer record for remote address */
struct rxrpc_sock __rcu *socket; /* socket responsible */
- unsigned long ack_at; /* When deferred ACK needs to happen */
- unsigned long resend_at; /* When next resend needs to happen */
- unsigned long expire_at; /* When the call times out */
+ ktime_t ack_at; /* When deferred ACK needs to happen */
+ ktime_t resend_at; /* When next resend needs to happen */
+ ktime_t expire_at; /* When the call times out */
struct timer_list timer; /* Combined event timer */
struct work_struct processor; /* Event processor */
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
@@ -486,6 +495,8 @@ struct rxrpc_call {
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
int debug_id; /* debug ID for printks */
+ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
+ unsigned short rx_pkt_len; /* Current recvmsg packet len */
/* Rx/Tx circular buffer, depending on phase.
*
@@ -505,6 +516,10 @@ struct rxrpc_call {
#define RXRPC_TX_ANNO_UNACK 1
#define RXRPC_TX_ANNO_NAK 2
#define RXRPC_TX_ANNO_RETRANS 3
+#define RXRPC_TX_ANNO_MASK 0x03
+#define RXRPC_TX_ANNO_LAST 0x04
+#define RXRPC_TX_ANNO_RESENT 0x08
+
#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */
#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */
#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
@@ -512,6 +527,20 @@ struct rxrpc_call {
* not hard-ACK'd packet follows this.
*/
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
+
+ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS
+ * is fixed, we keep these numbers in terms of segments (ie. DATA
+ * packets) rather than bytes.
+ */
+#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
+ u8 cong_cwnd; /* Congestion window size */
+ u8 cong_extra; /* Extra to send for congestion management */
+ u8 cong_ssthresh; /* Slow-start threshold */
+ enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
+ u8 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u8 cong_cumul_acks; /* Cumulative ACK count */
+ ktime_t cong_tstamp; /* Last time cwnd was changed */
+
rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not
* consumed packet follows this.
*/
@@ -519,6 +548,7 @@ struct rxrpc_call {
rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
u8 rx_winsize; /* Size of Rx window */
u8 tx_winsize; /* Maximum size of Tx window */
+ bool tx_phase; /* T if transmission phase, F if receive phase */
u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
/* receive-phase ACK management */
@@ -526,19 +556,104 @@ struct rxrpc_call {
u16 ackr_skew; /* skew on packet being ACK'd */
rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
- unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
- unsigned short rx_pkt_len; /* Current recvmsg packet len */
+ rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */
+ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */
+ rxrpc_serial_t ackr_ping; /* Last ping sent */
+ ktime_t ackr_ping_time; /* Time last ping sent */
/* transmission-phase ACK management */
+ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
+ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
};
+/*
+ * Summary of a new ACK and the changes it made to the Tx buffer packet states.
+ */
+struct rxrpc_ack_summary {
+ u8 ack_reason;
+ u8 nr_acks; /* Number of ACKs in packet */
+ u8 nr_nacks; /* Number of NACKs in packet */
+ u8 nr_new_acks; /* Number of new ACKs in packet */
+ u8 nr_new_nacks; /* Number of new NACKs in packet */
+ u8 nr_rot_new_acks; /* Number of rotated new ACKs */
+ bool new_low_nack; /* T if new low NACK found */
+ bool retrans_timeo; /* T if reTx due to timeout happened */
+ u8 flight_size; /* Number of unreceived transmissions */
+ /* Place to stash values for tracing */
+ enum rxrpc_congest_mode mode:8;
+ u8 cwnd;
+ u8 ssthresh;
+ u8 dup_acks;
+ u8 cumulative_acks;
+};
+
+enum rxrpc_skb_trace {
+ rxrpc_skb_rx_cleaned,
+ rxrpc_skb_rx_freed,
+ rxrpc_skb_rx_got,
+ rxrpc_skb_rx_lost,
+ rxrpc_skb_rx_received,
+ rxrpc_skb_rx_rotated,
+ rxrpc_skb_rx_purged,
+ rxrpc_skb_rx_seen,
+ rxrpc_skb_tx_cleaned,
+ rxrpc_skb_tx_freed,
+ rxrpc_skb_tx_got,
+ rxrpc_skb_tx_new,
+ rxrpc_skb_tx_rotated,
+ rxrpc_skb_tx_seen,
+ rxrpc_skb__nr_trace
+};
+
+extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
+
+enum rxrpc_conn_trace {
+ rxrpc_conn_new_client,
+ rxrpc_conn_new_service,
+ rxrpc_conn_queued,
+ rxrpc_conn_seen,
+ rxrpc_conn_got,
+ rxrpc_conn_put_client,
+ rxrpc_conn_put_service,
+ rxrpc_conn__nr_trace
+};
+
+extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
+
+enum rxrpc_client_trace {
+ rxrpc_client_activate_chans,
+ rxrpc_client_alloc,
+ rxrpc_client_chan_activate,
+ rxrpc_client_chan_disconnect,
+ rxrpc_client_chan_pass,
+ rxrpc_client_chan_unstarted,
+ rxrpc_client_cleanup,
+ rxrpc_client_count,
+ rxrpc_client_discard,
+ rxrpc_client_duplicate,
+ rxrpc_client_exposed,
+ rxrpc_client_replace,
+ rxrpc_client_to_active,
+ rxrpc_client_to_culled,
+ rxrpc_client_to_idle,
+ rxrpc_client_to_inactive,
+ rxrpc_client_to_waiting,
+ rxrpc_client_uncount,
+ rxrpc_client__nr_trace
+};
+
+extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
+extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
+
enum rxrpc_call_trace {
rxrpc_call_new_client,
rxrpc_call_new_service,
rxrpc_call_queued,
rxrpc_call_queued_ref,
rxrpc_call_seen,
+ rxrpc_call_connected,
+ rxrpc_call_release,
rxrpc_call_got,
rxrpc_call_got_userid,
rxrpc_call_got_kernel,
@@ -546,17 +661,130 @@ enum rxrpc_call_trace {
rxrpc_call_put_userid,
rxrpc_call_put_kernel,
rxrpc_call_put_noqueue,
+ rxrpc_call_error,
rxrpc_call__nr_trace
};
extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
+enum rxrpc_transmit_trace {
+ rxrpc_transmit_wait,
+ rxrpc_transmit_queue,
+ rxrpc_transmit_queue_last,
+ rxrpc_transmit_rotate,
+ rxrpc_transmit_rotate_last,
+ rxrpc_transmit_await_reply,
+ rxrpc_transmit_end,
+ rxrpc_transmit__nr_trace
+};
+
+extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
+
+enum rxrpc_receive_trace {
+ rxrpc_receive_incoming,
+ rxrpc_receive_queue,
+ rxrpc_receive_queue_last,
+ rxrpc_receive_front,
+ rxrpc_receive_rotate,
+ rxrpc_receive_end,
+ rxrpc_receive__nr_trace
+};
+
+extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
+
+enum rxrpc_recvmsg_trace {
+ rxrpc_recvmsg_enter,
+ rxrpc_recvmsg_wait,
+ rxrpc_recvmsg_dequeue,
+ rxrpc_recvmsg_hole,
+ rxrpc_recvmsg_next,
+ rxrpc_recvmsg_cont,
+ rxrpc_recvmsg_full,
+ rxrpc_recvmsg_data_return,
+ rxrpc_recvmsg_terminal,
+ rxrpc_recvmsg_to_be_accepted,
+ rxrpc_recvmsg_return,
+ rxrpc_recvmsg__nr_trace
+};
+
+extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
+
+enum rxrpc_rtt_tx_trace {
+ rxrpc_rtt_tx_ping,
+ rxrpc_rtt_tx_data,
+ rxrpc_rtt_tx__nr_trace
+};
+
+extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
+
+enum rxrpc_rtt_rx_trace {
+ rxrpc_rtt_rx_ping_response,
+ rxrpc_rtt_rx_requested_ack,
+ rxrpc_rtt_rx__nr_trace
+};
+
+extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
+
+enum rxrpc_timer_trace {
+ rxrpc_timer_begin,
+ rxrpc_timer_init_for_reply,
+ rxrpc_timer_expired,
+ rxrpc_timer_set_for_ack,
+ rxrpc_timer_set_for_resend,
+ rxrpc_timer_set_for_send,
+ rxrpc_timer__nr_trace
+};
+
+extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
+
+enum rxrpc_propose_ack_trace {
+ rxrpc_propose_ack_client_tx_end,
+ rxrpc_propose_ack_input_data,
+ rxrpc_propose_ack_ping_for_lost_ack,
+ rxrpc_propose_ack_ping_for_lost_reply,
+ rxrpc_propose_ack_ping_for_params,
+ rxrpc_propose_ack_respond_to_ack,
+ rxrpc_propose_ack_respond_to_ping,
+ rxrpc_propose_ack_retry_tx,
+ rxrpc_propose_ack_rotate_rx,
+ rxrpc_propose_ack_terminal_ack,
+ rxrpc_propose_ack__nr_trace
+};
+
+enum rxrpc_propose_ack_outcome {
+ rxrpc_propose_ack_use,
+ rxrpc_propose_ack_update,
+ rxrpc_propose_ack_subsume,
+ rxrpc_propose_ack__nr_outcomes
+};
+
+extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
+extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
+
+enum rxrpc_congest_change {
+ rxrpc_cong_begin_retransmission,
+ rxrpc_cong_cleared_nacks,
+ rxrpc_cong_new_low_nack,
+ rxrpc_cong_no_change,
+ rxrpc_cong_progress,
+ rxrpc_cong_retransmit_again,
+ rxrpc_cong_rtt_window_end,
+ rxrpc_cong_saw_nack,
+ rxrpc_congest__nr_change
+};
+
+extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
+extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
+
+extern const char *const rxrpc_pkts[];
+extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
+
#include <trace/events/rxrpc.h>
/*
* af_rxrpc.c
*/
-extern atomic_t rxrpc_n_skbs;
+extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
extern u32 rxrpc_epoch;
extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
@@ -577,7 +805,9 @@ int rxrpc_reject_call(struct rxrpc_sock *);
/*
* call_event.c
*/
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool);
+void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
+ enum rxrpc_propose_ack_trace);
void rxrpc_process_call(struct work_struct *);
/*
@@ -631,6 +861,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
call->error = error;
call->completion = compl,
call->state = RXRPC_CALL_COMPLETE;
+ wake_up(&call->waitq);
return true;
}
return false;
@@ -728,7 +959,11 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
void rxrpc_disconnect_call(struct rxrpc_call *);
void rxrpc_kill_connection(struct rxrpc_connection *);
-void __rxrpc_put_connection(struct rxrpc_connection *);
+bool rxrpc_queue_conn(struct rxrpc_connection *);
+void rxrpc_see_connection(struct rxrpc_connection *);
+void rxrpc_get_connection(struct rxrpc_connection *);
+struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
+void rxrpc_put_service_conn(struct rxrpc_connection *);
void __exit rxrpc_destroy_all_connections(void);
static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
@@ -741,38 +976,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
return !rxrpc_conn_is_client(conn);
}
-static inline void rxrpc_get_connection(struct rxrpc_connection *conn)
-{
- atomic_inc(&conn->usage);
-}
-
-static inline
-struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
-{
- return atomic_inc_not_zero(&conn->usage) ? conn : NULL;
-}
-
static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
{
if (!conn)
return;
- if (rxrpc_conn_is_client(conn)) {
- if (atomic_dec_and_test(&conn->usage))
- rxrpc_put_client_conn(conn);
- } else {
- if (atomic_dec_return(&conn->usage) == 1)
- __rxrpc_put_connection(conn);
- }
-}
-
-static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
-{
- if (!rxrpc_get_connection_maybe(conn))
- return false;
- if (!rxrpc_queue_work(&conn->processor))
- rxrpc_put_connection(conn);
- return true;
+ if (rxrpc_conn_is_client(conn))
+ rxrpc_put_client_conn(conn);
+ else
+ rxrpc_put_service_conn(conn);
}
/*
@@ -851,16 +1063,13 @@ extern unsigned int rxrpc_rx_mtu;
extern unsigned int rxrpc_rx_jumbo_max;
extern unsigned int rxrpc_resend_timeout;
-extern const char *const rxrpc_pkts[];
extern const s8 rxrpc_ack_priority[];
-extern const char *rxrpc_acks(u8 reason);
-
/*
* output.c
*/
int rxrpc_send_call_packet(struct rxrpc_call *, u8);
-int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *);
+int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
void rxrpc_reject_packets(struct rxrpc_local *);
/*
@@ -868,6 +1077,8 @@ void rxrpc_reject_packets(struct rxrpc_local *);
*/
void rxrpc_error_report(struct sock *);
void rxrpc_peer_error_distributor(struct work_struct *);
+void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
+ rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
/*
* peer_object.c
@@ -936,10 +1147,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
*/
void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
void rxrpc_packet_destructor(struct sk_buff *);
-void rxrpc_new_skb(struct sk_buff *);
-void rxrpc_see_skb(struct sk_buff *);
-void rxrpc_get_skb(struct sk_buff *);
-void rxrpc_free_skb(struct sk_buff *);
+void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
/*
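
The new rtt, rtt_sum, rtt_cache, rtt_cursor and rtt_usage fields point at a windowed mean over the last 32 samples, now kept in nanoseconds. The updater, rxrpc_peer_add_rtt(), is only declared in this diff, so the following is an assumed sketch of such a tracker rather than the kernel's code:

#include <stdint.h>
#include <stdio.h>

#define RTT_CACHE_SIZE 32

struct peer_rtt {
	uint64_t rtt;			/* current estimate (ns) */
	uint64_t rtt_sum;		/* sum of cached samples */
	uint64_t rtt_cache[RTT_CACHE_SIZE];
	uint8_t rtt_cursor;		/* next slot to overwrite */
	uint8_t rtt_usage;		/* slots filled so far */
};

static void peer_add_rtt(struct peer_rtt *p, uint64_t sample_ns)
{
	if (p->rtt_usage < RTT_CACHE_SIZE)
		p->rtt_usage++;
	else
		p->rtt_sum -= p->rtt_cache[p->rtt_cursor];  /* evict oldest */

	p->rtt_cache[p->rtt_cursor] = sample_ns;
	p->rtt_cursor = (p->rtt_cursor + 1) % RTT_CACHE_SIZE;
	p->rtt_sum += sample_ns;
	p->rtt = p->rtt_sum / p->rtt_usage;	/* windowed mean */
}

int main(void)
{
	struct peer_rtt p = { 0 };

	peer_add_rtt(&p, 100000);
	peer_add_rtt(&p, 300000);
	printf("rtt estimate: %llu ns\n", (unsigned long long)p.rtt);  /* 200000 */
	return 0;
}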
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 26c293e..3cac231 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -85,6 +85,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
b->conn_backlog[head] = conn;
smp_store_release(&b->conn_backlog_head,
(head + 1) & (size - 1));
+
+ trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ atomic_read(&conn->usage), here);
}
/* Now it gets complicated, because calls get registered with the
@@ -290,6 +293,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
rxrpc_get_local(local);
conn->params.local = local;
conn->params.peer = peer;
+ rxrpc_see_connection(conn);
rxrpc_new_incoming_connection(conn, skb);
} else {
rxrpc_get_connection(conn);
@@ -327,14 +331,14 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_sock *rx;
struct rxrpc_call *call;
+ u16 service_id = sp->hdr.serviceId;
_enter("");
/* Get the socket providing the service */
- hlist_for_each_entry_rcu_bh(rx, &local->services, listen_link) {
- if (rx->srx.srx_service == sp->hdr.serviceId)
- goto found_service;
- }
+ rx = rcu_dereference(local->service);
+	if (rx && service_id == rx->srx.srx_service)
+ goto found_service;
trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
RX_INVALID_OPERATION, EOPNOTSUPP);
@@ -363,12 +367,17 @@ found_service:
goto out;
}
+ trace_rxrpc_receive(call, rxrpc_receive_incoming,
+ sp->hdr.serial, sp->hdr.seq);
+
/* Make the call live. */
rxrpc_incoming_call(rx, call, skb);
conn = call->conn;
if (rx->notify_new_call)
rx->notify_new_call(&rx->sk, call, call->user_call_ID);
+ else
+ sk_acceptq_added(&rx->sk);
spin_lock(&conn->state_lock);
switch (conn->state) {
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 6143204..4f00476 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -24,28 +24,56 @@
/*
* Set the timer
*/
-static void rxrpc_set_timer(struct rxrpc_call *call)
+void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
+ ktime_t now)
{
- unsigned long t, now = jiffies;
+ unsigned long t_j, now_j = jiffies;
+ ktime_t t;
+ bool queue = false;
- _enter("{%ld,%ld,%ld:%ld}",
- call->ack_at - now, call->resend_at - now, call->expire_at - now,
- call->timer.expires - now);
-
read_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE) {
- t = call->ack_at;
- if (time_before(call->resend_at, t))
+ t = call->expire_at;
+ if (!ktime_after(t, now))
+ goto out;
+
+ if (!ktime_after(call->resend_at, now)) {
+ call->resend_at = call->expire_at;
+ if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ queue = true;
+ } else if (ktime_before(call->resend_at, t)) {
t = call->resend_at;
- if (time_before(call->expire_at, t))
- t = call->expire_at;
- if (!timer_pending(&call->timer) ||
- time_before(t, call->timer.expires)) {
- _debug("set timer %ld", t - now);
- mod_timer(&call->timer, t);
}
+
+ if (!ktime_after(call->ack_at, now)) {
+ call->ack_at = call->expire_at;
+ if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
+ queue = true;
+ } else if (ktime_before(call->ack_at, t)) {
+ t = call->ack_at;
+ }
+
+ t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
+ t_j += jiffies;
+
+ /* We have to make sure that the calculated jiffies value falls
+ * at or after the nsec value, or we may loop ceaselessly
+ * because the timer times out, but we haven't reached the nsec
+ * timeout yet.
+ */
+ t_j++;
+
+ if (call->timer.expires != t_j || !timer_pending(&call->timer)) {
+ mod_timer(&call->timer, t_j);
+ trace_rxrpc_timer(call, why, now, now_j);
+ }
+
+ if (queue)
+ rxrpc_queue_call(call);
}
+
+out:
read_unlock_bh(&call->state_lock);
}
@@ -54,14 +82,14 @@ static void rxrpc_set_timer(struct rxrpc_call *call)
*/
static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
u16 skew, u32 serial, bool immediate,
- bool background)
+ bool background,
+ enum rxrpc_propose_ack_trace why)
{
- unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay;
+ enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
+ unsigned int expiry = rxrpc_soft_ack_delay;
+ ktime_t now, ack_at;
s8 prior = rxrpc_ack_priority[ack_reason];
- _enter("{%d},%s,%%%x,%u",
- call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
-
/* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
* numbers, but we don't alter the timeout.
*/
@@ -70,15 +98,18 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]);
if (ack_reason == call->ackr_reason) {
if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) {
+ outcome = rxrpc_propose_ack_update;
call->ackr_serial = serial;
call->ackr_skew = skew;
}
if (!immediate)
- return;
+ goto trace;
} else if (prior > rxrpc_ack_priority[call->ackr_reason]) {
call->ackr_reason = ack_reason;
call->ackr_serial = serial;
call->ackr_skew = skew;
+ } else {
+ outcome = rxrpc_propose_ack_subsume;
}
switch (ack_reason) {
@@ -94,6 +125,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
expiry = rxrpc_soft_ack_delay;
break;
+ case RXRPC_ACK_PING:
case RXRPC_ACK_IDLE:
if (rxrpc_idle_ack_delay < expiry)
expiry = rxrpc_idle_ack_delay;
@@ -104,7 +136,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
break;
}
- now = jiffies;
if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
_debug("already scheduled");
} else if (immediate || expiry == 0) {
@@ -113,42 +144,56 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
background)
rxrpc_queue_call(call);
} else {
- ack_at = now + expiry;
- _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now);
- if (time_before(ack_at, call->ack_at)) {
+ now = ktime_get_real();
+ ack_at = ktime_add_ms(now, expiry);
+ if (ktime_before(ack_at, call->ack_at)) {
call->ack_at = ack_at;
- rxrpc_set_timer(call);
+ rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now);
}
}
+
+trace:
+ trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate,
+ background, outcome);
}
/*
* propose an ACK be sent, locking the call structure
*/
void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u16 skew, u32 serial, bool immediate, bool background)
+ u16 skew, u32 serial, bool immediate, bool background,
+ enum rxrpc_propose_ack_trace why)
{
spin_lock_bh(&call->lock);
__rxrpc_propose_ACK(call, ack_reason, skew, serial,
- immediate, background);
+ immediate, background, why);
spin_unlock_bh(&call->lock);
}
/*
+ * Handle congestion being detected by the retransmit timeout.
+ */
+static void rxrpc_congestion_timeout(struct rxrpc_call *call)
+{
+ set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
+}
+
+/*
* Perform retransmission of NAK'd and unack'd packets.
*/
-static void rxrpc_resend(struct rxrpc_call *call)
+static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
{
- struct rxrpc_wire_header *whdr;
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
rxrpc_seq_t cursor, seq, top;
- unsigned long resend_at, now;
+ ktime_t max_age, oldest, ack_ts;
int ix;
- u8 annotation;
+ u8 annotation, anno_type, retrans = 0, unacked = 0;
_enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
+ max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
+
spin_lock_bh(&call->lock);
cursor = call->tx_hard_ack;
@@ -161,68 +206,82 @@ static void rxrpc_resend(struct rxrpc_call *call)
* the packets in the Tx buffer we're going to resend and what the new
* resend timeout will be.
*/
- now = jiffies;
- resend_at = now + rxrpc_resend_timeout;
- seq = cursor + 1;
- do {
+ oldest = now;
+ for (seq = cursor + 1; before_eq(seq, top); seq++) {
ix = seq & RXRPC_RXTX_BUFF_MASK;
annotation = call->rxtx_annotations[ix];
- if (annotation == RXRPC_TX_ANNO_ACK)
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ if (anno_type == RXRPC_TX_ANNO_ACK)
continue;
skb = call->rxtx_buffer[ix];
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
sp = rxrpc_skb(skb);
- if (annotation == RXRPC_TX_ANNO_UNACK) {
- if (time_after(sp->resend_at, now)) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
+ if (anno_type == RXRPC_TX_ANNO_UNACK) {
+ if (ktime_after(skb->tstamp, max_age)) {
+ if (ktime_before(skb->tstamp, oldest))
+ oldest = skb->tstamp;
continue;
}
+ if (!(annotation & RXRPC_TX_ANNO_RESENT))
+ unacked++;
}
/* Okay, we need to retransmit a packet. */
- call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS;
- seq++;
- } while (before_eq(seq, top));
+ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation;
+ retrans++;
+ trace_rxrpc_retransmit(call, seq, annotation | anno_type,
+ ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
+ }
+
+ call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
- call->resend_at = resend_at;
+ if (unacked)
+ rxrpc_congestion_timeout(call);
+
+ /* If there was nothing that needed retransmission then it's likely
+ * that an ACK got lost somewhere. Send a ping to find out instead of
+ * retransmitting data.
+ */
+ if (!retrans) {
+ rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+ spin_unlock_bh(&call->lock);
+ ack_ts = ktime_sub(now, call->acks_latest_ts);
+ if (ktime_to_ns(ack_ts) < call->peer->rtt)
+ goto out;
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
+ rxrpc_propose_ack_ping_for_lost_ack);
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+ goto out;
+ }
/* Now go through the Tx window and perform the retransmissions. We
* have to drop the lock for each send. If an ACK comes in whilst the
* lock is dropped, it may clear some of the retransmission markers for
* packets that it soft-ACKs.
*/
- seq = cursor + 1;
- do {
+ for (seq = cursor + 1; before_eq(seq, top); seq++) {
ix = seq & RXRPC_RXTX_BUFF_MASK;
annotation = call->rxtx_annotations[ix];
- if (annotation != RXRPC_TX_ANNO_RETRANS)
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ if (anno_type != RXRPC_TX_ANNO_RETRANS)
continue;
skb = call->rxtx_buffer[ix];
- rxrpc_get_skb(skb);
+ rxrpc_get_skb(skb, rxrpc_skb_tx_got);
spin_unlock_bh(&call->lock);
- sp = rxrpc_skb(skb);
-
- /* Each Tx packet needs a new serial number */
- sp->hdr.serial = atomic_inc_return(&call->conn->serial);
- whdr = (struct rxrpc_wire_header *)skb->head;
- whdr->serial = htonl(sp->hdr.serial);
-
- if (rxrpc_send_data_packet(call->conn, skb) < 0) {
- call->resend_at = now + 2;
- rxrpc_free_skb(skb);
+ if (rxrpc_send_data_packet(call, skb, true) < 0) {
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
return;
}
if (rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
- sp->resend_at = now + rxrpc_resend_timeout;
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
spin_lock_bh(&call->lock);
/* We need to clear the retransmit state, but there are two
@@ -230,18 +289,25 @@ static void rxrpc_resend(struct rxrpc_call *call)
* received and the packet might have been hard-ACK'd (in which
* case it will no longer be in the buffer).
*/
- if (after(seq, call->tx_hard_ack) &&
- (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_RETRANS ||
- call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK))
- call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK;
+ if (after(seq, call->tx_hard_ack)) {
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ if (anno_type == RXRPC_TX_ANNO_RETRANS ||
+ anno_type == RXRPC_TX_ANNO_NAK) {
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ annotation |= RXRPC_TX_ANNO_UNACK;
+ }
+ annotation |= RXRPC_TX_ANNO_RESENT;
+ call->rxtx_annotations[ix] = annotation;
+ }
if (after(call->tx_hard_ack, seq))
seq = call->tx_hard_ack;
- seq++;
- } while (before_eq(seq, top));
+ }
out_unlock:
spin_unlock_bh(&call->lock);
+out:
_leave("");
}
@@ -252,7 +318,7 @@ void rxrpc_process_call(struct work_struct *work)
{
struct rxrpc_call *call =
container_of(work, struct rxrpc_call, processor);
- unsigned long now;
+ ktime_t now;
rxrpc_see_call(call);
@@ -271,14 +337,14 @@ recheck_state:
goto out_put;
}
- now = jiffies;
- if (time_after_eq(now, call->expire_at)) {
+ now = ktime_get_real();
+ if (ktime_before(call->expire_at, now)) {
rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME);
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ goto recheck_state;
}
- if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) ||
- time_after_eq(now, call->ack_at)) {
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) {
call->ack_at = call->expire_at;
if (call->ackr_reason) {
rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
@@ -286,13 +352,12 @@ recheck_state:
}
}
- if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) ||
- time_after_eq(now, call->resend_at)) {
- rxrpc_resend(call);
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) {
+ rxrpc_resend(call, now);
goto recheck_state;
}
- rxrpc_set_timer(call);
+ rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
/* other events may have been raised since we started checking */
if (call->events && call->state < RXRPC_CALL_COMPLETE) {
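
The t_j++ in rxrpc_set_timer() above is load-bearing: nsecs_to_jiffies() truncates, so a timer armed from the truncated value could fire a fraction of a tick before the ktime deadline and then re-arm over and over. A demonstration of the rounding, with an assumed 250 Hz tick:

#include <stdint.h>
#include <stdio.h>

#define HZ_DEMO 250				/* 4 ms per tick, illustrative */
#define NSEC_PER_JIFFY (1000000000ull / HZ_DEMO)

static uint64_t nsecs_to_jiffies_demo(uint64_t ns)
{
	return ns / NSEC_PER_JIFFY;		/* truncates, like the kernel helper */
}

int main(void)
{
	uint64_t delta_ns = 4100000;		/* 4.1 ms to the ktime deadline */
	uint64_t t_j = nsecs_to_jiffies_demo(delta_ns);

	printf("truncated: %llu ticks = %llu ns (fires early)\n",
	       (unsigned long long)t_j,
	       (unsigned long long)(t_j * NSEC_PER_JIFFY));
	t_j++;					/* the rounding-up step */
	printf("rounded:   %llu ticks = %llu ns (at/after deadline)\n",
	       (unsigned long long)t_j,
	       (unsigned long long)(t_j * NSEC_PER_JIFFY));
	return 0;
}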
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 22f9b0d..364b42d 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -19,11 +19,6 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-/*
- * Maximum lifetime of a call (in jiffies).
- */
-unsigned int rxrpc_max_call_lifetime = 60 * HZ;
-
const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
[RXRPC_CALL_UNINITIALISED] = "Uninit ",
[RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn",
@@ -53,6 +48,8 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
[rxrpc_call_new_service] = "NWs",
[rxrpc_call_queued] = "QUE",
[rxrpc_call_queued_ref] = "QUR",
+ [rxrpc_call_connected] = "CON",
+ [rxrpc_call_release] = "RLS",
[rxrpc_call_seen] = "SEE",
[rxrpc_call_got] = "GOT",
[rxrpc_call_got_userid] = "Gus",
@@ -61,6 +58,7 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
[rxrpc_call_put_userid] = "Pus",
[rxrpc_call_put_kernel] = "Pke",
[rxrpc_call_put_noqueue] = "PNQ",
+ [rxrpc_call_error] = "*E*",
};
struct kmem_cache *rxrpc_call_jar;
@@ -74,7 +72,7 @@ static void rxrpc_call_timer_expired(unsigned long _call)
_enter("%d", call->debug_id);
if (call->state < RXRPC_CALL_COMPLETE)
- rxrpc_queue_call(call);
+ rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
}
/*
@@ -155,6 +153,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
call->rx_winsize = rxrpc_rx_window_size;
call->tx_winsize = 16;
call->rx_expect_next = 1;
+
+ if (RXRPC_TX_SMSS > 2190)
+ call->cong_cwnd = 2;
+ else if (RXRPC_TX_SMSS > 1095)
+ call->cong_cwnd = 3;
+ else
+ call->cong_cwnd = 4;
+ call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
return call;
nomem_2:
@@ -171,6 +177,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
gfp_t gfp)
{
struct rxrpc_call *call;
+ ktime_t now;
_enter("");
@@ -179,6 +186,10 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
return ERR_PTR(-ENOMEM);
call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
call->service_id = srx->srx_service;
+ call->tx_phase = true;
+ now = ktime_get_real();
+ call->acks_latest_ts = now;
+ call->cong_tstamp = now;
_leave(" = %p", call);
return call;
@@ -189,14 +200,14 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
*/
static void rxrpc_start_call_timer(struct rxrpc_call *call)
{
- unsigned long expire_at;
+ ktime_t now = ktime_get_real(), expire_at;
- expire_at = jiffies + rxrpc_max_call_lifetime;
+ expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
call->expire_at = expire_at;
call->ack_at = expire_at;
call->resend_at = expire_at;
- call->timer.expires = expire_at;
- add_timer(&call->timer);
+ call->timer.expires = jiffies + LONG_MAX / 2;
+ rxrpc_set_timer(call, rxrpc_timer_begin, now);
}
/*
@@ -222,13 +233,10 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
return call;
}
- trace_rxrpc_call(call, 0, atomic_read(&call->usage), here,
- (const void *)user_call_ID);
+ trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
+ here, (const void *)user_call_ID);
/* Publish the call, even though it is incompletely set up as yet */
- call->user_call_ID = user_call_ID;
- __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
-
write_lock(&rx->call_lock);
pp = &rx->calls.rb_node;
@@ -242,10 +250,12 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
else if (user_call_ID > xcall->user_call_ID)
pp = &(*pp)->rb_right;
else
- goto found_user_ID_now_present;
+ goto error_dup_user_ID;
}
rcu_assign_pointer(call->socket, rx);
+ call->user_call_ID = user_call_ID;
+ __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
rxrpc_get_call(call, rxrpc_call_got_userid);
rb_link_node(&call->sock_node, parent, pp);
rb_insert_color(&call->sock_node, &rx->calls);
@@ -264,6 +274,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
if (ret < 0)
goto error;
+ trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage),
+ here, ERR_PTR(ret));
+
spin_lock_bh(&call->conn->params.peer->lock);
hlist_add_head(&call->error_link,
&call->conn->params.peer->error_targets);
@@ -276,33 +289,24 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
_leave(" = %p [new]", call);
return call;
-error:
- write_lock(&rx->call_lock);
- rb_erase(&call->sock_node, &rx->calls);
- write_unlock(&rx->call_lock);
- rxrpc_put_call(call, rxrpc_call_put_userid);
-
- write_lock(&rxrpc_call_lock);
- list_del_init(&call->link);
- write_unlock(&rxrpc_call_lock);
-
-error_out:
- __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
- RX_CALL_DEAD, ret);
- set_bit(RXRPC_CALL_RELEASED, &call->flags);
- rxrpc_put_call(call, rxrpc_call_put);
- _leave(" = %d", ret);
- return ERR_PTR(ret);
-
/* We unexpectedly found the user ID in the list after taking
* the call_lock. This shouldn't happen unless the user races
* with itself and tries to add the same user ID twice at the
* same time in different threads.
*/
-found_user_ID_now_present:
+error_dup_user_ID:
write_unlock(&rx->call_lock);
ret = -EEXIST;
- goto error_out;
+
+error:
+ __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
+ RX_CALL_DEAD, ret);
+ trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
+ here, ERR_PTR(ret));
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
}
/*
@@ -326,6 +330,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
call->state = RXRPC_CALL_SERVER_ACCEPTING;
if (sp->hdr.securityIndex > 0)
call->state = RXRPC_CALL_SERVER_SECURING;
+ call->cong_tstamp = skb->tstamp;
/* Set the channel for this call. We don't get channel_lock as we're
* only defending against the data_ready handler (which we're called
@@ -408,15 +413,17 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
*/
void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
{
+ const void *here = __builtin_return_address(0);
struct rxrpc_connection *conn = call->conn;
bool put = false;
int i;
_enter("{%d,%d}", call->debug_id, atomic_read(&call->usage));
- ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
+ trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage),
+ here, (const void *)call->flags);
- rxrpc_see_call(call);
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
spin_lock_bh(&call->lock);
if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
@@ -460,7 +467,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
rxrpc_disconnect_call(call);
for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
- rxrpc_free_skb(call->rxtx_buffer[i]);
+ rxrpc_free_skb(call->rxtx_buffer[i],
+ (call->tx_phase ? rxrpc_skb_tx_cleaned :
+ rxrpc_skb_rx_cleaned));
call->rxtx_buffer[i] = NULL;
}
@@ -476,6 +485,14 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
_enter("%p", rx);
+ while (!list_empty(&rx->to_be_accepted)) {
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
+ list_del(&call->accept_link);
+ rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET);
+ rxrpc_put_call(call, rxrpc_call_put);
+ }
+
while (!list_empty(&rx->sock_calls)) {
call = list_entry(rx->sock_calls.next,
struct rxrpc_call, sock_link);
@@ -546,9 +563,11 @@ void rxrpc_cleanup_call(struct rxrpc_call *call)
/* Clean up the Rx/Tx buffer */
for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
- rxrpc_free_skb(call->rxtx_buffer[i]);
+ rxrpc_free_skb(call->rxtx_buffer[i],
+ (call->tx_phase ? rxrpc_skb_tx_cleaned :
+ rxrpc_skb_rx_cleaned));
- rxrpc_free_skb(call->tx_pending);
+ rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned);
call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
}
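
The cong_cwnd initialisation in rxrpc_alloc_call() follows the initial-window rule of RFC 5681 section 3.1, counted in whole segments because rxrpc's SMSS is fixed at RXRPC_JUMBO_DATALEN. The same selection in isolation:

#include <stdio.h>

/* RFC 5681, 3.1: IW = 2 segments if SMSS > 2190 bytes,
 *                     3 segments if 1095 < SMSS <= 2190,
 *                     4 segments if SMSS <= 1095.
 */
static int initial_cwnd_segments(unsigned int smss)
{
	if (smss > 2190)
		return 2;
	else if (smss > 1095)
		return 3;
	return 4;
}

int main(void)
{
	unsigned int sizes[] = { 576, 1412, 4096 };

	for (int i = 0; i < 3; i++)
		printf("SMSS %4u -> initial cwnd %d segments\n",
		       sizes[i], initial_cwnd_segments(sizes[i]));
	return 0;
}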
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 9344a84..60ef960 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -105,6 +105,14 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
rxrpc_discard_expired_client_conns);
+const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
+ [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
+ [RXRPC_CONN_CLIENT_WAITING] = "Wait",
+ [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
+ [RXRPC_CONN_CLIENT_CULLED] = "Cull",
+ [RXRPC_CONN_CLIENT_IDLE] = "Idle",
+};
+
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
@@ -192,7 +200,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
}
atomic_set(&conn->usage, 1);
- if (conn->params.exclusive)
+ if (cp->exclusive)
__set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
conn->params = *cp;
@@ -220,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
rxrpc_get_local(conn->params.local);
key_get(conn->params.key);
+ trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage),
+ __builtin_return_address(0));
+ trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
_leave(" = %p", conn);
return conn;
@@ -385,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call,
rb_replace_node(&conn->client_node,
&candidate->client_node,
&local->client_conns);
+ trace_rxrpc_client(conn, -1, rxrpc_client_replace);
goto candidate_published;
}
}
@@ -409,8 +421,11 @@ found_extant_conn:
_debug("found conn");
spin_unlock(&local->client_conns_lock);
- rxrpc_put_connection(candidate);
- candidate = NULL;
+ if (candidate) {
+ trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
+ rxrpc_put_connection(candidate);
+ candidate = NULL;
+ }
spin_lock(&conn->channel_lock);
call->conn = conn;
@@ -433,6 +448,7 @@ error:
*/
static void rxrpc_activate_conn(struct rxrpc_connection *conn)
{
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_active);
conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
rxrpc_nr_active_client_conns++;
list_move_tail(&conn->cache_link, &rxrpc_active_client_conns);
@@ -462,8 +478,10 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn)
spin_lock(&rxrpc_client_conn_cache_lock);
nr_conns = rxrpc_nr_client_conns;
- if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags))
+ if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_count);
rxrpc_nr_client_conns = nr_conns + 1;
+ }
switch (conn->cache_state) {
case RXRPC_CONN_CLIENT_ACTIVE:
@@ -494,6 +512,7 @@ activate_conn:
wait_for_capacity:
_debug("wait");
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns);
goto out_unlock;
@@ -524,6 +543,8 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
struct rxrpc_call, chan_wait_link);
u32 call_id = chan->call_counter + 1;
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
+
write_lock_bh(&call->state_lock);
call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
write_unlock_bh(&call->state_lock);
@@ -555,26 +576,42 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
}
/*
+ * Assign channels and callNumbers to waiting calls with channel_lock
+ * held by caller.
+ */
+static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn)
+{
+ u8 avail, mask;
+
+ switch (conn->cache_state) {
+ case RXRPC_CONN_CLIENT_ACTIVE:
+ mask = RXRPC_ACTIVE_CHANS_MASK;
+ break;
+ default:
+ return;
+ }
+
+ while (!list_empty(&conn->waiting_calls) &&
+ (avail = ~conn->active_chans,
+ avail &= mask,
+ avail != 0))
+ rxrpc_activate_one_channel(conn, __ffs(avail));
+}
+
+/*
* Assign channels and callNumbers to waiting calls.
*/
static void rxrpc_activate_channels(struct rxrpc_connection *conn)
{
- unsigned char mask;
-
_enter("%d", conn->debug_id);
- if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE ||
- conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
+ trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans);
+
+ if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
return;
spin_lock(&conn->channel_lock);
-
- while (!list_empty(&conn->waiting_calls) &&
- (mask = ~conn->active_chans,
- mask &= RXRPC_ACTIVE_CHANS_MASK,
- mask != 0))
- rxrpc_activate_one_channel(conn, __ffs(mask));
-
+ rxrpc_activate_channels_locked(conn);
spin_unlock(&conn->channel_lock);
_leave("");
}
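
The channel picker above turns allocation into two bit operations: inverting active_chans and masking with RXRPC_ACTIVE_CHANS_MASK yields a bitmap of free channels, and __ffs() selects the lowest one. A minimal userspace sketch of the same idiom, using __builtin_ctz() in place of the kernel's __ffs() (the four-channel mask and helper name here are illustrative, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define ACTIVE_CHANS_MASK 0x0f            /* four channels per connection */

    /* Return the lowest free channel, or -1 if all four are busy. */
    static int pick_free_channel(uint8_t active_chans)
    {
            uint8_t avail = ~active_chans & ACTIVE_CHANS_MASK;

            if (!avail)
                    return -1;
            return __builtin_ctz(avail);      /* lowest set bit, like __ffs() */
    }

    int main(void)
    {
            printf("%d\n", pick_free_channel(0x05));  /* 0b0101 -> channel 1 */
            printf("%d\n", pick_free_channel(0x0f));  /* all busy -> -1 */
            return 0;
    }
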
@@ -657,10 +694,13 @@ int rxrpc_connect_call(struct rxrpc_call *call,
* had a chance at re-use (the per-connection security negotiation is
* expensive).
*/
-static void rxrpc_expose_client_conn(struct rxrpc_connection *conn)
+static void rxrpc_expose_client_conn(struct rxrpc_connection *conn,
+ unsigned int channel)
{
- if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags))
+ if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
rxrpc_get_connection(conn);
+ }
}
/*
@@ -669,9 +709,9 @@ static void rxrpc_expose_client_conn(struct rxrpc_connection *conn)
*/
void rxrpc_expose_client_call(struct rxrpc_call *call)
{
+ unsigned int channel = call->cid & RXRPC_CHANNELMASK;
struct rxrpc_connection *conn = call->conn;
- struct rxrpc_channel *chan =
- &conn->channels[call->cid & RXRPC_CHANNELMASK];
+ struct rxrpc_channel *chan = &conn->channels[channel];
if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
/* Mark the call ID as being used. If the callNumber counter
@@ -682,7 +722,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
chan->call_counter++;
if (chan->call_counter >= INT_MAX)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
- rxrpc_expose_client_conn(conn);
+ rxrpc_expose_client_conn(conn, channel);
}
}
@@ -695,6 +735,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
struct rxrpc_connection *conn = call->conn;
struct rxrpc_channel *chan = &conn->channels[channel];
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
call->conn = NULL;
spin_lock(&conn->channel_lock);
@@ -709,6 +750,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
list_del_init(&call->chan_wait_link);
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted);
+
/* We must deactivate or idle the connection if it's now
* waiting for nothing.
*/
@@ -721,7 +764,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
}
ASSERTCMP(rcu_access_pointer(chan->call), ==, call);
- ASSERTCMP(atomic_read(&conn->usage), >=, 2);
/* If a client call was exposed to the world, we save the result for
* retransmission.
@@ -740,7 +782,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
/* See if we can pass the channel directly to another call. */
if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE &&
!list_empty(&conn->waiting_calls)) {
- _debug("pass chan");
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
rxrpc_activate_one_channel(conn, channel);
goto out_2;
}
@@ -763,7 +805,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
goto out;
}
- _debug("pass chan 2");
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
rxrpc_activate_one_channel(conn, channel);
goto out;
@@ -795,7 +837,7 @@ idle_connection:
* immediately or moved to the idle list for a short while.
*/
if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- _debug("make idle");
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
conn->idle_timestamp = jiffies;
conn->cache_state = RXRPC_CONN_CLIENT_IDLE;
list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns);
@@ -805,7 +847,7 @@ idle_connection:
&rxrpc_client_conn_reap,
rxrpc_conn_idle_client_expiry);
} else {
- _debug("make inactive");
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
list_del_init(&conn->cache_link);
}
@@ -818,10 +860,12 @@ idle_connection:
static struct rxrpc_connection *
rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next;
+ struct rxrpc_connection *next = NULL;
struct rxrpc_local *local = conn->params.local;
unsigned int nr_conns;
+ trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+
if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
spin_lock(&local->client_conns_lock);
if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
@@ -834,24 +878,23 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
- if (!test_bit(RXRPC_CONN_COUNTED, &conn->flags))
- return NULL;
-
- spin_lock(&rxrpc_client_conn_cache_lock);
- nr_conns = --rxrpc_nr_client_conns;
+ if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ nr_conns = --rxrpc_nr_client_conns;
+
+ if (nr_conns < rxrpc_max_client_connections &&
+ !list_empty(&rxrpc_waiting_client_conns)) {
+ next = list_entry(rxrpc_waiting_client_conns.next,
+ struct rxrpc_connection, cache_link);
+ rxrpc_get_connection(next);
+ rxrpc_activate_conn(next);
+ }
- next = NULL;
- if (nr_conns < rxrpc_max_client_connections &&
- !list_empty(&rxrpc_waiting_client_conns)) {
- next = list_entry(rxrpc_waiting_client_conns.next,
- struct rxrpc_connection, cache_link);
- rxrpc_get_connection(next);
- rxrpc_activate_conn(next);
+ spin_unlock(&rxrpc_client_conn_cache_lock);
}
- spin_unlock(&rxrpc_client_conn_cache_lock);
rxrpc_kill_connection(conn);
-
if (next)
rxrpc_activate_channels(next);
@@ -866,20 +909,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
*/
void rxrpc_put_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next;
+ const void *here = __builtin_return_address(0);
+ int n;
do {
- _enter("%p{u=%d,d=%d}",
- conn, atomic_read(&conn->usage), conn->debug_id);
-
- next = rxrpc_put_one_client_conn(conn);
-
- if (!next)
- break;
- conn = next;
- } while (atomic_dec_and_test(&conn->usage));
-
- _leave("");
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here);
+ if (n > 0)
+ return;
+ ASSERTCMP(n, >=, 0);
+
+ conn = rxrpc_put_one_client_conn(conn);
+ } while (conn);
}
/*
@@ -910,9 +951,11 @@ static void rxrpc_cull_active_client_conns(void)
ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE);
if (list_empty(&conn->waiting_calls)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
list_del_init(&conn->cache_link);
} else {
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
list_move_tail(&conn->cache_link,
&rxrpc_waiting_client_conns);
@@ -986,7 +1029,7 @@ next:
goto not_yet_expired;
}
- _debug("discard conn %d", conn->debug_id);
+ trace_rxrpc_client(conn, -1, rxrpc_client_discard);
if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
BUG();
conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 0691007..3f9d8d7 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -97,6 +97,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
pkt.info.maxMTU = htonl(mtu);
pkt.info.rwind = htonl(rxrpc_rx_window_size);
pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+ pkt.whdr.flags |= RXRPC_SLOW_START_OK;
len += sizeof(pkt.ack) + sizeof(pkt.info);
break;
}
@@ -119,6 +120,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
_proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
break;
case RXRPC_PACKET_TYPE_ACK:
+ trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
+ RXRPC_ACK_DUPLICATE, 0);
_proto("Tx ACK %%%u [re]", serial);
break;
}
@@ -273,7 +276,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
return 0;
case RXRPC_PACKET_TYPE_ABORT:
- if (skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &wtmp, sizeof(wtmp)) < 0)
return -EPROTO;
abort_code = ntohl(wtmp);
_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
@@ -377,7 +381,7 @@ void rxrpc_process_connection(struct work_struct *work)
u32 abort_code = RX_PROTOCOL_ERROR;
int ret;
- _enter("{%d}", conn->debug_id);
+ rxrpc_see_connection(conn);
if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
rxrpc_secure_connection(conn);
@@ -385,7 +389,7 @@ void rxrpc_process_connection(struct work_struct *work)
/* go through the conn-level event packets, releasing the ref on this
* connection that each one has when we've finished with it */
while ((skb = skb_dequeue(&conn->rx_queue))) {
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
ret = rxrpc_process_event(conn, skb, &abort_code);
switch (ret) {
case -EPROTO:
@@ -396,7 +400,7 @@ void rxrpc_process_connection(struct work_struct *work)
goto requeue_and_leave;
case -ECONNABORTED:
default:
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
break;
}
}
@@ -413,7 +417,7 @@ requeue_and_leave:
protocol_error:
if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
goto requeue_and_leave;
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
_leave(" [EPROTO]");
goto out;
}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index bb1f292..e1e83af 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -53,7 +53,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
spin_lock_init(&conn->state_lock);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
conn->size_align = 4;
- conn->header_size = sizeof(struct rxrpc_wire_header);
conn->idle_timestamp = jiffies;
}
@@ -246,11 +245,77 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn)
}
/*
- * release a virtual connection
+ * Queue a connection's work processor, getting a ref to pass to the work
+ * queue.
*/
-void __rxrpc_put_connection(struct rxrpc_connection *conn)
+bool rxrpc_queue_conn(struct rxrpc_connection *conn)
{
- rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ const void *here = __builtin_return_address(0);
+ int n = __atomic_add_unless(&conn->usage, 1, 0);
+ if (n == 0)
+ return false;
+ if (rxrpc_queue_work(&conn->processor))
+ trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here);
+ else
+ rxrpc_put_connection(conn);
+ return true;
+}
+
+/*
+ * Note the re-emergence of a connection.
+ */
+void rxrpc_see_connection(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ if (conn) {
+ int n = atomic_read(&conn->usage);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here);
+ }
+}
+
+/*
+ * Get a ref on a connection.
+ */
+void rxrpc_get_connection(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(&conn->usage);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_got, n, here);
+}
+
+/*
+ * Try to get a ref on a connection.
+ */
+struct rxrpc_connection *
+rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+
+ if (conn) {
+ int n = __atomic_add_unless(&conn->usage, 1, 0);
+ if (n > 0)
+ trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
+ else
+ conn = NULL;
+ }
+ return conn;
+}
+
+/*
+ * Release a service connection
+ */
+void rxrpc_put_service_conn(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n;
+
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
+ ASSERTCMP(n, >=, 0);
+ if (n == 0)
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
}
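
rxrpc_get_connection_maybe() above relies on __atomic_add_unless(&conn->usage, 1, 0): the reference is taken only if the count has not already dropped to zero, so a dying connection is never resurrected. A sketch of the same conditional increment with C11 atomics (the helper name is illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Take a reference unless the count has already reached zero; returns
     * true if a reference was taken.  Mirrors __atomic_add_unless(v, 1, 0).
     */
    static bool get_ref_unless_zero(atomic_int *usage)
    {
            int n = atomic_load(usage);

            do {
                    if (n == 0)
                            return false;     /* object is being torn down */
            } while (!atomic_compare_exchange_weak(usage, &n, n + 1));
            return true;
    }
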
/*
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index 83d54da..eef551f 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -136,6 +136,10 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp)
list_add_tail(&conn->link, &rxrpc_connections);
list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
write_unlock(&rxrpc_connection_lock);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ atomic_read(&conn->usage),
+ __builtin_return_address(0));
}
return conn;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 75af0bd..3ad9f75 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -37,12 +37,198 @@ static void rxrpc_proto_abort(const char *why,
}
/*
+ * Do TCP-style congestion management [RFC 5681].
+ */
+static void rxrpc_congestion_management(struct rxrpc_call *call,
+ struct sk_buff *skb,
+ struct rxrpc_ack_summary *summary,
+ rxrpc_serial_t acked_serial)
+{
+ enum rxrpc_congest_change change = rxrpc_cong_no_change;
+ unsigned int cumulative_acks = call->cong_cumul_acks;
+ unsigned int cwnd = call->cong_cwnd;
+ bool resend = false;
+
+ summary->flight_size =
+ (call->tx_top - call->tx_hard_ack) - summary->nr_acks;
+
+ if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
+ summary->retrans_timeo = true;
+ call->cong_ssthresh = max_t(unsigned int,
+ summary->flight_size / 2, 2);
+ cwnd = 1;
+ if (cwnd >= call->cong_ssthresh &&
+ call->cong_mode == RXRPC_CALL_SLOW_START) {
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_tstamp = skb->tstamp;
+ cumulative_acks = 0;
+ }
+ }
+
+ cumulative_acks += summary->nr_new_acks;
+ cumulative_acks += summary->nr_rot_new_acks;
+ if (cumulative_acks > 255)
+ cumulative_acks = 255;
+
+ summary->mode = call->cong_mode;
+ summary->cwnd = call->cong_cwnd;
+ summary->ssthresh = call->cong_ssthresh;
+ summary->cumulative_acks = cumulative_acks;
+ summary->dup_acks = call->cong_dup_acks;
+
+ switch (call->cong_mode) {
+ case RXRPC_CALL_SLOW_START:
+ if (summary->nr_nacks > 0)
+ goto packet_loss_detected;
+ if (summary->cumulative_acks > 0)
+ cwnd += 1;
+ if (cwnd >= call->cong_ssthresh) {
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_tstamp = skb->tstamp;
+ }
+ goto out;
+
+ case RXRPC_CALL_CONGEST_AVOIDANCE:
+ if (summary->nr_nacks > 0)
+ goto packet_loss_detected;
+
+ /* We analyse the number of packets that get ACK'd per RTT
+ * period and increase the window if we managed to fill it.
+ */
+ if (call->peer->rtt_usage == 0)
+ goto out;
+ if (ktime_before(skb->tstamp,
+ ktime_add_ns(call->cong_tstamp,
+ call->peer->rtt)))
+ goto out_no_clear_ca;
+ change = rxrpc_cong_rtt_window_end;
+ call->cong_tstamp = skb->tstamp;
+ if (cumulative_acks >= cwnd)
+ cwnd++;
+ goto out;
+
+ case RXRPC_CALL_PACKET_LOSS:
+ if (summary->nr_nacks == 0)
+ goto resume_normality;
+
+ if (summary->new_low_nack) {
+ change = rxrpc_cong_new_low_nack;
+ call->cong_dup_acks = 1;
+ if (call->cong_extra > 1)
+ call->cong_extra = 1;
+ goto send_extra_data;
+ }
+
+ call->cong_dup_acks++;
+ if (call->cong_dup_acks < 3)
+ goto send_extra_data;
+
+ change = rxrpc_cong_begin_retransmission;
+ call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
+ call->cong_ssthresh = max_t(unsigned int,
+ summary->flight_size / 2, 2);
+ cwnd = call->cong_ssthresh + 3;
+ call->cong_extra = 0;
+ call->cong_dup_acks = 0;
+ resend = true;
+ goto out;
+
+ case RXRPC_CALL_FAST_RETRANSMIT:
+ if (!summary->new_low_nack) {
+ if (summary->nr_new_acks == 0)
+ cwnd += 1;
+ call->cong_dup_acks++;
+ if (call->cong_dup_acks == 2) {
+ change = rxrpc_cong_retransmit_again;
+ call->cong_dup_acks = 0;
+ resend = true;
+ }
+ } else {
+ change = rxrpc_cong_progress;
+ cwnd = call->cong_ssthresh;
+ if (summary->nr_nacks == 0)
+ goto resume_normality;
+ }
+ goto out;
+
+ default:
+ BUG();
+ goto out;
+ }
+
+resume_normality:
+ change = rxrpc_cong_cleared_nacks;
+ call->cong_dup_acks = 0;
+ call->cong_extra = 0;
+ call->cong_tstamp = skb->tstamp;
+ if (cwnd < call->cong_ssthresh)
+ call->cong_mode = RXRPC_CALL_SLOW_START;
+ else
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+out:
+ cumulative_acks = 0;
+out_no_clear_ca:
+ if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1)
+ cwnd = RXRPC_RXTX_BUFF_SIZE - 1;
+ call->cong_cwnd = cwnd;
+ call->cong_cumul_acks = cumulative_acks;
+ trace_rxrpc_congest(call, summary, acked_serial, change);
+ if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ rxrpc_queue_call(call);
+ return;
+
+packet_loss_detected:
+ change = rxrpc_cong_saw_nack;
+ call->cong_mode = RXRPC_CALL_PACKET_LOSS;
+ call->cong_dup_acks = 0;
+ goto send_extra_data;
+
+send_extra_data:
+	/* Send some previously unsent DATA, if we have any, to advance the
+	 * ACK state.
+	 */
+ if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
+ RXRPC_TX_ANNO_LAST ||
+ summary->nr_acks != call->tx_top - call->tx_hard_ack) {
+ call->cong_extra++;
+ wake_up(&call->waitq);
+ }
+ goto out_no_clear_ca;
+}
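
The mode machine above follows RFC 5681: cwnd grows by one packet per ACK in slow start, by one packet per window-full of ACKs in congestion avoidance, and ssthresh drops to half the flight size (floored at two packets) on loss. A stripped-down sketch of just that window arithmetic; the struct and field names are illustrative, not the rxrpc ones:

    enum cong_mode { SLOW_START, CONGEST_AVOIDANCE };

    struct cong_state {
            enum cong_mode  mode;
            unsigned int    cwnd;             /* congestion window, packets */
            unsigned int    ssthresh;         /* slow-start threshold */
            unsigned int    acks_this_window;
    };

    /* A new ACK arrived covering nr_acked packets. */
    static void on_ack(struct cong_state *c, unsigned int nr_acked)
    {
            if (c->mode == SLOW_START) {
                    c->cwnd++;                /* roughly doubles per RTT */
                    if (c->cwnd >= c->ssthresh)
                            c->mode = CONGEST_AVOIDANCE;
                    return;
            }
            c->acks_this_window += nr_acked;
            if (c->acks_this_window >= c->cwnd) {
                    c->cwnd++;                /* +1 packet per full window */
                    c->acks_this_window = 0;
            }
    }

    /* Loss detected via NACKs; on a retransmit timeout the patch instead
     * halves ssthresh the same way but resets cwnd all the way to 1.
     */
    static void on_loss(struct cong_state *c)
    {
            c->ssthresh = c->cwnd / 2 > 2 ? c->cwnd / 2 : 2;
            c->cwnd = c->ssthresh;
            c->mode = CONGEST_AVOIDANCE;
    }
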
+
+/*
+ * Ping the other end to fill our RTT cache and to retrieve the rwind
+ * and MTU parameters.
+ */
+static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb,
+ int skew)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ ktime_t now = skb->tstamp;
+
+ if (call->peer->rtt_usage < 3 ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
+ true, true,
+ rxrpc_propose_ack_ping_for_params);
+}
+
+/*
* Apply a hard ACK by advancing the Tx window.
*/
-static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to)
+static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
+ struct rxrpc_ack_summary *summary)
{
struct sk_buff *skb, *list = NULL;
int ix;
+ u8 annotation;
+
+ if (call->acks_lowest_nak == call->tx_hard_ack) {
+ call->acks_lowest_nak = to;
+ } else if (before_eq(call->acks_lowest_nak, to)) {
+ summary->new_low_nack = true;
+ call->acks_lowest_nak = to;
+ }
spin_lock(&call->lock);
@@ -50,22 +236,31 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to)
call->tx_hard_ack++;
ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
- rxrpc_see_skb(skb);
+ annotation = call->rxtx_annotations[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_tx_rotated);
call->rxtx_buffer[ix] = NULL;
call->rxtx_annotations[ix] = 0;
skb->next = list;
list = skb;
+
+ if (annotation & RXRPC_TX_ANNO_LAST)
+ set_bit(RXRPC_CALL_TX_LAST, &call->flags);
+ if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
+ summary->nr_rot_new_acks++;
}
spin_unlock(&call->lock);
+ trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ?
+ rxrpc_transmit_rotate_last :
+ rxrpc_transmit_rotate));
wake_up(&call->waitq);
while (list) {
skb = list;
list = skb->next;
skb->next = NULL;
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
}
}
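
The Tx window rotated above is a power-of-two ring: a sequence number maps to a slot via seq & RXRPC_RXTX_BUFF_MASK, so applying a hard ACK is just walking the hard-ack point forward and releasing each slot it passes. A sketch of that indexing with illustrative sizes (the kernel additionally guards sequence wrap with before()):

    #define BUFF_SIZE 64                      /* must be a power of two */
    #define BUFF_MASK (BUFF_SIZE - 1)

    struct tx_window {
            void            *slot[BUFF_SIZE];
            unsigned int    hard_ack;         /* highest consecutively ACK'd */
            unsigned int    top;              /* highest sequence queued */
    };

    /* Advance the hard-ACK point to 'to', releasing each rotated-out slot. */
    static void rotate_tx_window(struct tx_window *w, unsigned int to,
                                 void (*release)(void *))
    {
            while (w->hard_ack != to) {
                    unsigned int ix = ++w->hard_ack & BUFF_MASK;

                    release(w->slot[ix]);
                    w->slot[ix] = NULL;
            }
    }
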
@@ -75,40 +270,78 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to)
* This occurs when we get an ACKALL packet, the first DATA packet of a reply,
* or a final ACK packet.
*/
-static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why)
+static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
+ const char *abort_why)
{
- _enter("");
- switch (call->state) {
- case RXRPC_CALL_CLIENT_RECV_REPLY:
- return true;
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- case RXRPC_CALL_SERVER_AWAIT_ACK:
- break;
- default:
- rxrpc_proto_abort(abort_why, call, call->tx_top);
- return false;
- }
-
- rxrpc_rotate_tx_window(call, call->tx_top);
+ ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
write_lock(&call->state_lock);
switch (call->state) {
- default:
- break;
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ if (reply_begun)
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ else
+ call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
break;
+
case RXRPC_CALL_SERVER_AWAIT_ACK:
__rxrpc_call_completed(call);
rxrpc_notify_socket(call);
break;
+
+ default:
+ goto bad_state;
}
write_unlock(&call->state_lock);
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true,
+ rxrpc_propose_ack_client_tx_end);
+ trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
+ } else {
+ trace_rxrpc_transmit(call, rxrpc_transmit_end);
+ }
_leave(" = ok");
return true;
+
+bad_state:
+ write_unlock(&call->state_lock);
+ kdebug("end_tx %s", rxrpc_call_states[call->state]);
+ rxrpc_proto_abort(abort_why, call, call->tx_top);
+ return false;
+}
+
+/*
+ * Begin the reply reception phase of a call.
+ */
+static bool rxrpc_receiving_reply(struct rxrpc_call *call)
+{
+ struct rxrpc_ack_summary summary = { 0 };
+ rxrpc_seq_t top = READ_ONCE(call->tx_top);
+
+ if (call->ackr_reason) {
+ spin_lock_bh(&call->lock);
+ call->ackr_reason = 0;
+ call->resend_at = call->expire_at;
+ call->ack_at = call->expire_at;
+ spin_unlock_bh(&call->lock);
+ rxrpc_set_timer(call, rxrpc_timer_init_for_reply,
+ ktime_get_real());
+ }
+
+ if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ rxrpc_rotate_tx_window(call, top, &summary);
+ if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
+ rxrpc_proto_abort("TXL", call, top);
+ return false;
+ }
+ if (!rxrpc_end_tx_phase(call, true, "ETD"))
+ return false;
+ call->tx_phase = false;
+ return true;
}
/*
@@ -126,7 +359,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why)
static bool rxrpc_validate_jumbo(struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int offset = sp->offset;
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len = skb->len;
int nr_jumbo = 1;
u8 flags = sp->hdr.flags;
@@ -187,7 +420,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
u16 skew)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int offset = sp->offset;
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int ix;
rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
rxrpc_seq_t seq = sp->hdr.seq, hard_ack;
@@ -207,8 +440,9 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
/* Received data implicitly ACKs all of the request packets we sent
* when we're acting as a client.
*/
- if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY &&
- !rxrpc_end_tx_phase(call, "ETD"))
+ if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
+ !rxrpc_receiving_reply(call))
return;
call->ackr_prev_seq = seq;
@@ -238,7 +472,7 @@ next_subpacket:
len = RXRPC_JUMBO_DATALEN;
if (flags & RXRPC_LAST_PACKET) {
- if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
seq != call->rx_top)
return rxrpc_proto_abort("LSN", call, seq);
} else {
@@ -276,12 +510,26 @@ next_subpacket:
* Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
* and also rxrpc_fill_out_ack().
*/
- rxrpc_get_skb(skb);
+ rxrpc_get_skb(skb, rxrpc_skb_rx_got);
call->rxtx_annotations[ix] = annotation;
smp_wmb();
call->rxtx_buffer[ix] = skb;
- if (after(seq, call->rx_top))
+ if (after(seq, call->rx_top)) {
smp_store_release(&call->rx_top, seq);
+ } else if (before(seq, call->rx_top)) {
+ /* Send an immediate ACK if we fill in a hole */
+ if (!ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
+ immediate_ack = true;
+ }
+ if (flags & RXRPC_LAST_PACKET) {
+ set_bit(RXRPC_CALL_RX_LAST, &call->flags);
+ trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
+ } else {
+ trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
+ }
queued = true;
if (after_eq(seq, call->rx_expect_next)) {
@@ -326,7 +574,8 @@ skip:
ack:
if (ack)
rxrpc_propose_ACK(call, ack, skew, ack_serial,
- immediate_ack, true);
+ immediate_ack, true,
+ rxrpc_propose_ack_input_data);
if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
rxrpc_notify_socket(call);
@@ -334,6 +583,64 @@ ack:
}
/*
+ * Process a requested ACK.
+ */
+static void rxrpc_input_requested_ack(struct rxrpc_call *call,
+ ktime_t resp_time,
+ rxrpc_serial_t orig_serial,
+ rxrpc_serial_t ack_serial)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ ktime_t sent_at;
+ int ix;
+
+ for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) {
+ skb = call->rxtx_buffer[ix];
+ if (!skb)
+ continue;
+
+ sp = rxrpc_skb(skb);
+ if (sp->hdr.serial != orig_serial)
+ continue;
+ smp_rmb();
+ sent_at = skb->tstamp;
+ goto found;
+ }
+ return;
+
+found:
+ rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack,
+ orig_serial, ack_serial, sent_at, resp_time);
+}
+
+/*
+ * Process a ping response.
+ */
+static void rxrpc_input_ping_response(struct rxrpc_call *call,
+ ktime_t resp_time,
+ rxrpc_serial_t orig_serial,
+ rxrpc_serial_t ack_serial)
+{
+ rxrpc_serial_t ping_serial;
+ ktime_t ping_time;
+
+ ping_time = call->ackr_ping_time;
+ smp_rmb();
+ ping_serial = call->ackr_ping;
+
+ if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
+ before(orig_serial, ping_serial))
+ return;
+ clear_bit(RXRPC_CALL_PINGING, &call->flags);
+ if (after(orig_serial, ping_serial))
+ return;
+
+ rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response,
+ orig_serial, ack_serial, ping_time, resp_time);
+}
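
Both RTT paths above order 32-bit serial numbers that wrap, so a plain less-than would misjudge values either side of the wrap point. The before()/after() helpers used here compare the signed difference instead; a self-contained sketch:

    #include <stdint.h>
    #include <stdbool.h>

    /* True if serial a was issued before serial b, tolerating 32-bit wrap;
     * valid while the two serials are less than 2^31 apart.
     */
    static bool serial_before(uint32_t a, uint32_t b)
    {
            return (int32_t)(a - b) < 0;
    }

    static bool serial_after(uint32_t a, uint32_t b)
    {
            return (int32_t)(b - a) < 0;
    }

    /* e.g. serial_before(0xfffffffe, 0x00000001) is true across the wrap. */
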
+
+/*
* Process the extra information that may be appended to an ACK packet
*/
static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
@@ -352,6 +659,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
rwind = RXRPC_RXTX_BUFF_SIZE - 1;
call->tx_winsize = rwind;
+ if (call->cong_ssthresh > rwind)
+ call->cong_ssthresh = rwind;
mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU));
@@ -375,31 +684,45 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
* the time the ACK was sent.
*/
static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
- rxrpc_seq_t seq, int nr_acks)
+ rxrpc_seq_t seq, int nr_acks,
+ struct rxrpc_ack_summary *summary)
{
- bool resend = false;
int ix;
+ u8 annotation, anno_type;
for (; nr_acks > 0; nr_acks--, seq++) {
ix = seq & RXRPC_RXTX_BUFF_MASK;
- switch (*acks) {
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ switch (*acks++) {
case RXRPC_ACK_TYPE_ACK:
- call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK;
+ summary->nr_acks++;
+ if (anno_type == RXRPC_TX_ANNO_ACK)
+ continue;
+ summary->nr_new_acks++;
+ call->rxtx_annotations[ix] =
+ RXRPC_TX_ANNO_ACK | annotation;
break;
case RXRPC_ACK_TYPE_NACK:
- if (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK)
+ if (!summary->nr_nacks &&
+ call->acks_lowest_nak != seq) {
+ call->acks_lowest_nak = seq;
+ summary->new_low_nack = true;
+ }
+ summary->nr_nacks++;
+ if (anno_type == RXRPC_TX_ANNO_NAK)
continue;
- call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK;
- resend = true;
+ summary->nr_new_nacks++;
+ if (anno_type == RXRPC_TX_ANNO_RETRANS)
+ continue;
+ call->rxtx_annotations[ix] =
+ RXRPC_TX_ANNO_NAK | annotation;
break;
default:
return rxrpc_proto_abort("SFT", call, 0);
}
}
-
- if (resend &&
- !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
- rxrpc_queue_call(call);
}
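
Each Tx slot's annotation records the last soft-ACK verdict for that packet, and the loop above only bumps the "new" counters when the verdict actually changes; those deltas are what feed the congestion summary. A compact sketch of the transition accounting (the enum values and names are illustrative):

    #include <stdbool.h>

    enum anno { ANNO_UNACK, ANNO_ACK, ANNO_NAK, ANNO_RETRANS };

    struct ack_summary {
            unsigned int nr_acks, nr_new_acks, nr_nacks, nr_new_nacks;
    };

    /* Fold one soft-ACK verdict for a slot into the summary. */
    static void account_soft_ack(enum anno *slot, bool acked,
                                 struct ack_summary *sum)
    {
            if (acked) {
                    sum->nr_acks++;
                    if (*slot == ANNO_ACK)
                            return;           /* no change: not a new ACK */
                    sum->nr_new_acks++;
                    *slot = ANNO_ACK;
            } else {
                    sum->nr_nacks++;
                    if (*slot == ANNO_NAK)
                            return;           /* already NAK'd */
                    sum->nr_new_nacks++;
                    if (*slot != ANNO_RETRANS)
                            *slot = ANNO_NAK; /* keep retransmit marker */
            }
    }
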
/*
@@ -415,48 +738,65 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
u16 skew)
{
+ struct rxrpc_ack_summary summary = { 0 };
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
union {
struct rxrpc_ackpacket ack;
struct rxrpc_ackinfo info;
u8 acks[RXRPC_MAXACKS];
} buf;
+ rxrpc_serial_t acked_serial;
rxrpc_seq_t first_soft_ack, hard_ack;
- int nr_acks, offset;
+ int nr_acks, offset, ioffset;
_enter("");
- if (skb_copy_bits(skb, sp->offset, &buf.ack, sizeof(buf.ack)) < 0) {
+ offset = sizeof(struct rxrpc_wire_header);
+ if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) {
_debug("extraction failure");
return rxrpc_proto_abort("XAK", call, 0);
}
- sp->offset += sizeof(buf.ack);
+ offset += sizeof(buf.ack);
+ acked_serial = ntohl(buf.ack.serial);
first_soft_ack = ntohl(buf.ack.firstPacket);
hard_ack = first_soft_ack - 1;
nr_acks = buf.ack.nAcks;
+ summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
+ buf.ack.reason : RXRPC_ACK__INVALID);
+
+ trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
_proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
sp->hdr.serial,
ntohs(buf.ack.maxSkew),
first_soft_ack,
ntohl(buf.ack.previousPacket),
- ntohl(buf.ack.serial),
- rxrpc_acks(buf.ack.reason),
+ acked_serial,
+ rxrpc_ack_names[summary.ack_reason],
buf.ack.nAcks);
+ if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
+ rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
+ sp->hdr.serial);
+ if (buf.ack.reason == RXRPC_ACK_REQUESTED)
+ rxrpc_input_requested_ack(call, skb->tstamp, acked_serial,
+ sp->hdr.serial);
+
if (buf.ack.reason == RXRPC_ACK_PING) {
_proto("Rx ACK %%%u PING Request", sp->hdr.serial);
rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
- skew, sp->hdr.serial, true, true);
+ skew, sp->hdr.serial, true, true,
+ rxrpc_propose_ack_respond_to_ping);
} else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
- skew, sp->hdr.serial, true, true);
+ skew, sp->hdr.serial, true, true,
+ rxrpc_propose_ack_respond_to_ack);
}
- offset = sp->offset + nr_acks + 3;
- if (skb->len >= offset + sizeof(buf.info)) {
- if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0)
+ ioffset = offset + nr_acks + 3;
+ if (skb->len >= ioffset + sizeof(buf.info)) {
+ if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
return rxrpc_proto_abort("XAI", call, 0);
rxrpc_input_ackinfo(call, skb, &buf.info);
}
@@ -476,34 +816,43 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
}
/* Discard any out-of-order or duplicate ACKs. */
- if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) {
+ if (before_eq(sp->hdr.serial, call->acks_latest)) {
_debug("discard ACK %d <= %d",
sp->hdr.serial, call->acks_latest);
return;
}
+ call->acks_latest_ts = skb->tstamp;
call->acks_latest = sp->hdr.serial;
- if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) &&
- hard_ack == call->tx_top) {
- rxrpc_end_tx_phase(call, "ETA");
- return;
- }
-
if (before(hard_ack, call->tx_hard_ack) ||
after(hard_ack, call->tx_top))
return rxrpc_proto_abort("AKW", call, 0);
+ if (nr_acks > call->tx_top - hard_ack)
+ return rxrpc_proto_abort("AKN", call, 0);
if (after(hard_ack, call->tx_hard_ack))
- rxrpc_rotate_tx_window(call, hard_ack);
+ rxrpc_rotate_tx_window(call, hard_ack, &summary);
+
+ if (nr_acks > 0) {
+ if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0)
+ return rxrpc_proto_abort("XSA", call, 0);
+ rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
+ &summary);
+ }
- if (after(first_soft_ack, call->tx_top))
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
+ rxrpc_end_tx_phase(call, false, "ETA");
return;
+ }
+
+ if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
+ RXRPC_TX_ANNO_LAST &&
+ summary.nr_acks == call->tx_top - hard_ack)
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
+ false, true,
+ rxrpc_propose_ack_ping_for_lost_reply);
- if (nr_acks > call->tx_top - first_soft_ack + 1)
- nr_acks = first_soft_ack - call->tx_top + 1;
- if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0)
- return rxrpc_proto_abort("XSA", call, 0);
- rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks);
+ return rxrpc_congestion_management(call, skb, &summary, acked_serial);
}
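
Before rotating, the ACK parser above rejects a hard-ACK point outside [tx_hard_ack, tx_top] and an nAcks count larger than the number of packets still outstanding, since either would walk the ring out of bounds. A sketch of those guards using the same signed-difference comparisons:

    #include <stdint.h>
    #include <stdbool.h>

    static bool seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
    static bool seq_after(uint32_t a, uint32_t b)  { return (int32_t)(b - a) < 0; }

    /* Validate an incoming hard-ACK point against the current Tx window. */
    static bool ack_sane(uint32_t hard_ack, uint32_t nr_acks,
                         uint32_t tx_hard_ack, uint32_t tx_top)
    {
            if (seq_before(hard_ack, tx_hard_ack))  /* regressing */
                    return false;
            if (seq_after(hard_ack, tx_top))        /* beyond what was sent */
                    return false;
            if (nr_acks > tx_top - hard_ack)        /* more ACKs than packets */
                    return false;
            return true;
    }
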
/*
@@ -511,11 +860,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
*/
static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
{
+ struct rxrpc_ack_summary summary = { 0 };
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
_proto("Rx ACKALL %%%u", sp->hdr.serial);
- rxrpc_end_tx_phase(call, "ETL");
+ rxrpc_rotate_tx_window(call, call->tx_top, &summary);
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ rxrpc_end_tx_phase(call, false, "ETL");
}
/*
@@ -530,7 +882,8 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb)
_enter("");
if (skb->len >= 4 &&
- skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) >= 0)
+ skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &wtmp, sizeof(wtmp)) >= 0)
abort_code = ntohl(wtmp);
_proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
@@ -646,7 +999,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
sp->hdr.securityIndex = whdr.securityIndex;
sp->hdr._rsvd = ntohs(whdr._rsvd);
sp->hdr.serviceId = ntohs(whdr.serviceId);
- sp->offset = sizeof(whdr);
return 0;
}
@@ -681,13 +1033,13 @@ void rxrpc_data_ready(struct sock *udp_sk)
return;
}
- rxrpc_new_skb(skb);
+ rxrpc_new_skb(skb, rxrpc_skb_rx_received);
_net("recv skb %p", skb);
/* we'll probably need to checksum it (didn't call sock_recvmsg) */
if (skb_checksum_complete(skb)) {
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
__UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
_leave(" [CSUM failed]");
return;
@@ -701,12 +1053,19 @@ void rxrpc_data_ready(struct sock *udp_sk)
skb_orphan(skb);
sp = rxrpc_skb(skb);
- _net("Rx UDP packet from %08x:%04hu",
- ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
-
/* dig out the RxRPC connection details */
if (rxrpc_extract_header(sp, skb) < 0)
goto bad_message;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+ if ((lose++ & 7) == 7) {
+ trace_rxrpc_rx_lose(sp);
+ rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
+ return;
+ }
+ }
+
trace_rxrpc_rx_packet(sp);
_net("Rx RxRPC %s ep=%x call=%x:%x",
@@ -803,6 +1162,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
rcu_read_unlock();
goto reject_packet;
}
+ rxrpc_send_ping(call, skb, skew);
}
rxrpc_input_call_packet(call, skb, skew);
@@ -811,7 +1171,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
discard_unlock:
rcu_read_unlock();
discard:
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
out:
trace_rxrpc_rx_done(0, 0);
return;
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index f073e93..540d395 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -90,12 +90,13 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
if (skb) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
_debug("{%d},{%u}", local->debug_id, sp->hdr.type);
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_VERSION:
- if (skb_copy_bits(skb, sp->offset, &v, 1) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &v, 1) < 0)
return;
_proto("Rx VERSION { %02x }", v);
if (v == 0)
@@ -107,7 +108,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
break;
}
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
}
_leave("");
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index e3fad80..ff4864d5 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -86,7 +86,6 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
atomic_set(&local->usage, 1);
INIT_LIST_HEAD(&local->link);
INIT_WORK(&local->processor, rxrpc_local_processor);
- INIT_HLIST_HEAD(&local->services);
init_rwsem(&local->defrag_sem);
skb_queue_head_init(&local->reject_queue);
skb_queue_head_init(&local->event_queue);
@@ -292,7 +291,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
mutex_unlock(&rxrpc_local_mutex);
ASSERT(RB_EMPTY_ROOT(&local->client_conns));
- ASSERT(hlist_empty(&local->services));
+ ASSERT(!local->service);
if (socket) {
local->socket = NULL;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 8b91078..9d1c721 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -21,28 +21,33 @@
unsigned int rxrpc_max_backlog __read_mostly = 10;
/*
+ * Maximum lifetime of a call (in ms).
+ */
+unsigned int rxrpc_max_call_lifetime = 60 * 1000;
+
+/*
* How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in jiffies).
+ * packet with RXRPC_REQUEST_ACK set (in ms).
*/
unsigned int rxrpc_requested_ack_delay = 1;
/*
- * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
+ * How long to wait before scheduling an ACK with subtype DELAY (in ms).
*
* We use this when we've received new data packets. If those packets aren't
* all consumed within this time we will send a DELAY ACK if an ACK was not
* requested to let the sender know it doesn't need to resend.
*/
-unsigned int rxrpc_soft_ack_delay = 1 * HZ;
+unsigned int rxrpc_soft_ack_delay = 1 * 1000;
/*
- * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
+ * How long to wait before scheduling an ACK with subtype IDLE (in ms).
*
 * We use this when we've consumed some previously soft-ACK'd packets but
 * further packets aren't immediately received, to decide when to send an IDLE
 * ACK to let the other end know that it can free up its Tx buffer space.
*/
-unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
+unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
/*
* Receive window size in packets. This indicates the maximum number of
@@ -68,9 +73,9 @@ unsigned int rxrpc_rx_mtu = 5692;
unsigned int rxrpc_rx_jumbo_max = 4;
/*
- * Time till packet resend (in jiffies).
+ * Time till packet resend (in ms).
*/
-unsigned int rxrpc_resend_timeout = 4 * HZ;
+unsigned int rxrpc_resend_timeout = 4 * 1000;
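
With the tunables now expressed in milliseconds rather than jiffies, any jiffies-based deadline has to convert at the point of use rather than at definition. A sketch of the conversion the timer code would rely on (msecs_to_jiffies() is the kernel helper; the arithmetic below is the userspace equivalent at an assumed tick rate):

    #define TICK_HZ 250     /* illustrative tick rate, like the kernel's HZ */

    /* Convert a millisecond tunable to ticks, rounding up so that short
     * timeouts never round down to zero.
     */
    static unsigned long ms_to_ticks(unsigned int ms)
    {
            return ((unsigned long)ms * TICK_HZ + 999) / 1000;
    }

    /* e.g. deadline = now + ms_to_ticks(rxrpc_resend_timeout); */
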
const char *const rxrpc_pkts[] = {
"?00",
@@ -83,21 +88,152 @@ const s8 rxrpc_ack_priority[] = {
[RXRPC_ACK_DELAY] = 1,
[RXRPC_ACK_REQUESTED] = 2,
[RXRPC_ACK_IDLE] = 3,
- [RXRPC_ACK_PING_RESPONSE] = 4,
- [RXRPC_ACK_DUPLICATE] = 5,
- [RXRPC_ACK_OUT_OF_SEQUENCE] = 6,
- [RXRPC_ACK_EXCEEDS_WINDOW] = 7,
- [RXRPC_ACK_NOSPACE] = 8,
-};
-
-const char *rxrpc_acks(u8 reason)
-{
- static const char *const str[] = {
- "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
- "IDL", "-?-"
- };
-
- if (reason >= ARRAY_SIZE(str))
- reason = ARRAY_SIZE(str) - 1;
- return str[reason];
-}
+ [RXRPC_ACK_DUPLICATE] = 4,
+ [RXRPC_ACK_OUT_OF_SEQUENCE] = 5,
+ [RXRPC_ACK_EXCEEDS_WINDOW] = 6,
+ [RXRPC_ACK_NOSPACE] = 7,
+ [RXRPC_ACK_PING_RESPONSE] = 8,
+ [RXRPC_ACK_PING] = 9,
+};
+
+const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
+ "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
+ "IDL", "-?-"
+};
+
+const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
+ [rxrpc_skb_rx_cleaned] = "Rx CLN",
+ [rxrpc_skb_rx_freed] = "Rx FRE",
+ [rxrpc_skb_rx_got] = "Rx GOT",
+ [rxrpc_skb_rx_lost] = "Rx *L*",
+ [rxrpc_skb_rx_received] = "Rx RCV",
+ [rxrpc_skb_rx_purged] = "Rx PUR",
+ [rxrpc_skb_rx_rotated] = "Rx ROT",
+ [rxrpc_skb_rx_seen] = "Rx SEE",
+ [rxrpc_skb_tx_cleaned] = "Tx CLN",
+ [rxrpc_skb_tx_freed] = "Tx FRE",
+ [rxrpc_skb_tx_got] = "Tx GOT",
+ [rxrpc_skb_tx_new] = "Tx NEW",
+ [rxrpc_skb_tx_rotated] = "Tx ROT",
+ [rxrpc_skb_tx_seen] = "Tx SEE",
+};
+
+const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
+ [rxrpc_conn_new_client] = "NWc",
+ [rxrpc_conn_new_service] = "NWs",
+ [rxrpc_conn_queued] = "QUE",
+ [rxrpc_conn_seen] = "SEE",
+ [rxrpc_conn_got] = "GOT",
+ [rxrpc_conn_put_client] = "PTc",
+ [rxrpc_conn_put_service] = "PTs",
+};
+
+const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
+ [rxrpc_client_activate_chans] = "Activa",
+ [rxrpc_client_alloc] = "Alloc ",
+ [rxrpc_client_chan_activate] = "ChActv",
+ [rxrpc_client_chan_disconnect] = "ChDisc",
+ [rxrpc_client_chan_pass] = "ChPass",
+ [rxrpc_client_chan_unstarted] = "ChUnst",
+ [rxrpc_client_cleanup] = "Clean ",
+ [rxrpc_client_count] = "Count ",
+ [rxrpc_client_discard] = "Discar",
+ [rxrpc_client_duplicate] = "Duplic",
+ [rxrpc_client_exposed] = "Expose",
+ [rxrpc_client_replace] = "Replac",
+ [rxrpc_client_to_active] = "->Actv",
+ [rxrpc_client_to_culled] = "->Cull",
+ [rxrpc_client_to_idle] = "->Idle",
+ [rxrpc_client_to_inactive] = "->Inac",
+ [rxrpc_client_to_waiting] = "->Wait",
+ [rxrpc_client_uncount] = "Uncoun",
+};
+
+const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
+ [rxrpc_transmit_wait] = "WAI",
+ [rxrpc_transmit_queue] = "QUE",
+ [rxrpc_transmit_queue_last] = "QLS",
+ [rxrpc_transmit_rotate] = "ROT",
+ [rxrpc_transmit_rotate_last] = "RLS",
+ [rxrpc_transmit_await_reply] = "AWR",
+ [rxrpc_transmit_end] = "END",
+};
+
+const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
+ [rxrpc_receive_incoming] = "INC",
+ [rxrpc_receive_queue] = "QUE",
+ [rxrpc_receive_queue_last] = "QLS",
+ [rxrpc_receive_front] = "FRN",
+ [rxrpc_receive_rotate] = "ROT",
+ [rxrpc_receive_end] = "END",
+};
+
+const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
+ [rxrpc_recvmsg_enter] = "ENTR",
+ [rxrpc_recvmsg_wait] = "WAIT",
+ [rxrpc_recvmsg_dequeue] = "DEQU",
+ [rxrpc_recvmsg_hole] = "HOLE",
+ [rxrpc_recvmsg_next] = "NEXT",
+ [rxrpc_recvmsg_cont] = "CONT",
+ [rxrpc_recvmsg_full] = "FULL",
+ [rxrpc_recvmsg_data_return] = "DATA",
+ [rxrpc_recvmsg_terminal] = "TERM",
+ [rxrpc_recvmsg_to_be_accepted] = "TBAC",
+ [rxrpc_recvmsg_return] = "RETN",
+};
+
+const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
+ [rxrpc_rtt_tx_ping] = "PING",
+ [rxrpc_rtt_tx_data] = "DATA",
+};
+
+const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
+ [rxrpc_rtt_rx_ping_response] = "PONG",
+ [rxrpc_rtt_rx_requested_ack] = "RACK",
+};
+
+const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
+ [rxrpc_timer_begin] = "Begin ",
+ [rxrpc_timer_expired] = "*EXPR*",
+ [rxrpc_timer_init_for_reply] = "IniRpl",
+ [rxrpc_timer_set_for_ack] = "SetAck",
+ [rxrpc_timer_set_for_send] = "SetTx ",
+ [rxrpc_timer_set_for_resend] = "SetRTx",
+};
+
+const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
+ [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
+ [rxrpc_propose_ack_input_data] = "DataIn ",
+ [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
+ [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
+ [rxrpc_propose_ack_ping_for_params] = "Params ",
+ [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
+ [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
+ [rxrpc_propose_ack_retry_tx] = "RetryTx",
+ [rxrpc_propose_ack_rotate_rx] = "RxAck ",
+ [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
+};
+
+const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
+ [rxrpc_propose_ack_use] = "",
+ [rxrpc_propose_ack_update] = " Update",
+ [rxrpc_propose_ack_subsume] = " Subsume",
+};
+
+const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
+ [RXRPC_CALL_SLOW_START] = "SlowStart",
+ [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
+ [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
+ [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
+};
+
+const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
+ [rxrpc_cong_begin_retransmission] = " Retrans",
+ [rxrpc_cong_cleared_nacks] = " Cleared",
+ [rxrpc_cong_new_low_nack] = " NewLowN",
+ [rxrpc_cong_no_change] = "",
+ [rxrpc_cong_progress] = " Progres",
+ [rxrpc_cong_retransmit_again] = " ReTxAgn",
+ [rxrpc_cong_rtt_window_end] = " RttWinE",
+ [rxrpc_cong_saw_nack] = " SawNack",
+};
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 06a9aca..0d47db8 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -36,25 +36,34 @@ struct rxrpc_pkt_buffer {
* Fill out an ACK packet.
*/
static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
- struct rxrpc_pkt_buffer *pkt)
+ struct rxrpc_pkt_buffer *pkt,
+ rxrpc_seq_t *_hard_ack,
+ rxrpc_seq_t *_top)
{
+ rxrpc_serial_t serial;
rxrpc_seq_t hard_ack, top, seq;
int ix;
u32 mtu, jmax;
u8 *ackp = pkt->acks;
/* Barrier against rxrpc_input_data(). */
+ serial = call->ackr_serial;
hard_ack = READ_ONCE(call->rx_hard_ack);
top = smp_load_acquire(&call->rx_top);
+ *_hard_ack = hard_ack;
+ *_top = top;
pkt->ack.bufferSpace = htons(8);
pkt->ack.maxSkew = htons(call->ackr_skew);
pkt->ack.firstPacket = htonl(hard_ack + 1);
pkt->ack.previousPacket = htonl(call->ackr_prev_seq);
- pkt->ack.serial = htonl(call->ackr_serial);
+ pkt->ack.serial = htonl(serial);
pkt->ack.reason = call->ackr_reason;
pkt->ack.nAcks = top - hard_ack;
+ if (pkt->ack.reason == RXRPC_ACK_PING)
+ pkt->whdr.flags |= RXRPC_REQUEST_ACK;
+
if (after(top, hard_ack)) {
seq = hard_ack + 1;
do {
@@ -91,7 +100,9 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
struct msghdr msg;
struct kvec iov[2];
rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top;
size_t len, n;
+ bool ping = false;
int ioc, ret;
u32 abort_code;
@@ -110,8 +121,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
return -ENOMEM;
}
- serial = atomic_inc_return(&conn->serial);
-
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
@@ -122,7 +131,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
pkt->whdr.cid = htonl(call->cid);
pkt->whdr.callNumber = htonl(call->call_id);
pkt->whdr.seq = 0;
- pkt->whdr.serial = htonl(serial);
pkt->whdr.type = type;
pkt->whdr.flags = conn->out_clientflag;
pkt->whdr.userStatus = 0;
@@ -137,19 +145,19 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
switch (type) {
case RXRPC_PACKET_TYPE_ACK:
spin_lock_bh(&call->lock);
- n = rxrpc_fill_out_ack(call, pkt);
+ if (!call->ackr_reason) {
+ spin_unlock_bh(&call->lock);
+ ret = 0;
+ goto out;
+ }
+ ping = (call->ackr_reason == RXRPC_ACK_PING);
+ n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top);
call->ackr_reason = 0;
spin_unlock_bh(&call->lock);
- _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- serial,
- ntohs(pkt->ack.maxSkew),
- ntohl(pkt->ack.firstPacket),
- ntohl(pkt->ack.previousPacket),
- ntohl(pkt->ack.serial),
- rxrpc_acks(pkt->ack.reason),
- pkt->ack.nAcks);
+
+ pkt->whdr.flags |= RXRPC_SLOW_START_OK;
iov[0].iov_len += sizeof(pkt->ack) + n;
iov[1].iov_base = &pkt->ackinfo;
@@ -161,7 +169,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
case RXRPC_PACKET_TYPE_ABORT:
abort_code = call->abort_code;
pkt->abort_code = htonl(abort_code);
- _proto("Tx ABORT %%%u { %d }", serial, abort_code);
iov[0].iov_len += sizeof(pkt->abort_code);
len += sizeof(pkt->abort_code);
ioc = 1;
@@ -173,19 +180,52 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
goto out;
}
+ serial = atomic_inc_return(&conn->serial);
+ pkt->whdr.serial = htonl(serial);
+ switch (type) {
+ case RXRPC_PACKET_TYPE_ACK:
+ trace_rxrpc_tx_ack(call, serial,
+ ntohl(pkt->ack.firstPacket),
+ ntohl(pkt->ack.serial),
+ pkt->ack.reason, pkt->ack.nAcks);
+ break;
+ }
+
+ if (ping) {
+ call->ackr_ping = serial;
+ smp_wmb();
+ /* We need to stick a time in before we send the packet in case
+ * the reply gets back before kernel_sendmsg() completes - but
+ * asking UDP to send the packet can take a relatively long
+ * time, so we update the time after, on the assumption that
+ * the packet transmission is more likely to happen towards the
+ * end of the kernel_sendmsg() call.
+ */
+ call->ackr_ping_time = ktime_get_real();
+ set_bit(RXRPC_CALL_PINGING, &call->flags);
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial);
+ }
ret = kernel_sendmsg(conn->params.local->socket,
&msg, iov, ioc, len);
+ if (ping)
+ call->ackr_ping_time = ktime_get_real();
- if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) {
- switch (pkt->whdr.type) {
- case RXRPC_PACKET_TYPE_ACK:
+ if (type == RXRPC_PACKET_TYPE_ACK &&
+ call->state < RXRPC_CALL_COMPLETE) {
+ if (ret < 0) {
+ clear_bit(RXRPC_CALL_PINGING, &call->flags);
rxrpc_propose_ACK(call, pkt->ack.reason,
ntohs(pkt->ack.maxSkew),
ntohl(pkt->ack.serial),
- true, true);
- break;
- case RXRPC_PACKET_TYPE_ABORT:
- break;
+ true, true,
+ rxrpc_propose_ack_retry_tx);
+ } else {
+ spin_lock_bh(&call->lock);
+ if (after(hard_ack, call->ackr_consumed))
+ call->ackr_consumed = hard_ack;
+ if (after(top, call->ackr_seen))
+ call->ackr_seen = top;
+ spin_unlock_bh(&call->lock);
}
}
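
The double timestamp around kernel_sendmsg() above records a time before the send, in case the response races it, then overwrites it afterwards on the assumption that the datagram actually left near the end of the call. A sketch of the same bracketing with a monotonic clock (the structure and names are illustrative):

    #include <time.h>
    #include <stdint.h>

    struct ping_state {
            uint32_t        ping_serial;
            struct timespec ping_time;
    };

    static void send_ping(struct ping_state *ps, uint32_t serial,
                          int (*send_packet)(void))
    {
            ps->ping_serial = serial;
            /* Stamp before sending in case the response overtakes us... */
            clock_gettime(CLOCK_MONOTONIC, &ps->ping_time);
            (void)send_packet();
            /* ...then restamp: the datagram most likely left the host near
             * the end of the send call, so this is usually more accurate.
             */
            clock_gettime(CLOCK_MONOTONIC, &ps->ping_time);
    }
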
@@ -198,43 +238,102 @@ out:
/*
* send a packet through the transport endpoint
*/
-int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
+int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ bool retrans)
{
- struct kvec iov[1];
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct msghdr msg;
+ struct kvec iov[2];
+ rxrpc_serial_t serial;
+ size_t len;
+ bool lost = false;
int ret, opt;
_enter(",{%d}", skb->len);
- iov[0].iov_base = skb->head;
- iov[0].iov_len = skb->len;
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = atomic_inc_return(&conn->serial);
+
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(call->cid);
+ whdr.callNumber = htonl(call->call_id);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = htonl(serial);
+ whdr.type = RXRPC_PACKET_TYPE_DATA;
+ whdr.flags = sp->hdr.flags;
+ whdr.userStatus = 0;
+ whdr.securityIndex = call->security_ix;
+ whdr._rsvd = htons(sp->hdr._rsvd);
+ whdr.serviceId = htons(call->service_id);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = skb->head;
+ iov[1].iov_len = skb->len;
+ len = iov[0].iov_len + iov[1].iov_len;
- msg.msg_name = &conn->params.peer->srx.transport;
- msg.msg_namelen = conn->params.peer->srx.transport_len;
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- /* send the packet with the don't fragment bit set if we currently
- * think it's small enough */
- if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) {
- down_read(&conn->params.local->defrag_sem);
- /* send the packet by UDP
- * - returns -EMSGSIZE if UDP would have to fragment the packet
- * to go out of the interface
- * - in which case, we'll have processed the ICMP error
- * message and update the peer record
- */
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ /* If our RTT cache needs working on, request an ACK. Also request
+ * ACKs if a DATA packet appears to have been lost.
+ */
+ if (retrans ||
+ call->cong_mode == RXRPC_CALL_SLOW_START ||
+ (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ ktime_get_real()))
+ whdr.flags |= RXRPC_REQUEST_ACK;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+ if ((lose++ & 7) == 7) {
+ ret = 0;
+ lost = true;
+ goto done;
+ }
+ }
- up_read(&conn->params.local->defrag_sem);
- if (ret == -EMSGSIZE)
- goto send_fragmentable;
+ _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq);
- _leave(" = %d [%u]", ret, conn->params.peer->maxdata);
- return ret;
+ /* send the packet with the don't fragment bit set if we currently
+ * think it's small enough */
+ if (iov[1].iov_len >= call->peer->maxdata)
+ goto send_fragmentable;
+
+ down_read(&conn->params.local->defrag_sem);
+ /* send the packet by UDP
+ * - returns -EMSGSIZE if UDP would have to fragment the packet
+ * to go out of the interface
+ * - in which case, we'll have processed the ICMP error
+ * message and update the peer record
+	 *   message and updated the peer record
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+
+ up_read(&conn->params.local->defrag_sem);
+ if (ret == -EMSGSIZE)
+ goto send_fragmentable;
+
+done:
+ trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
+ retrans, lost);
+ if (ret >= 0) {
+ ktime_t now = ktime_get_real();
+ skb->tstamp = now;
+ smp_wmb();
+ sp->hdr.serial = serial;
+ if (whdr.flags & RXRPC_REQUEST_ACK) {
+ call->peer->rtt_last_req = now;
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
+ }
}
+ _leave(" = %d [%u]", ret, call->peer->maxdata);
+ return ret;
send_fragmentable:
/* attempt to send this message with fragmentation enabled */
@@ -249,8 +348,8 @@ send_fragmentable:
SOL_IP, IP_MTU_DISCOVER,
(char *)&opt, sizeof(opt));
if (ret == 0) {
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg,
+ iov, 2, len);
opt = IP_PMTUDISC_DO;
kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -279,8 +378,7 @@ send_fragmentable:
}
up_write(&conn->params.local->defrag_sem);
- _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
- return ret;
+ goto done;
}
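
The rewritten transmit path keeps the wire header out of the skb and passes it as the first element of a two-element kvec, so every retransmission can carry a fresh serial without rewriting the buffered packet. A userspace sketch of the same gather-send shape using POSIX sendmsg() with two iovecs (the header layout here is illustrative, not the rxrpc wire format):

    #include <string.h>
    #include <stdint.h>
    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    struct wire_header {                      /* illustrative layout only */
            uint32_t serial;
            uint32_t seq;
    };

    static ssize_t send_with_header(int fd, uint32_t serial, uint32_t seq,
                                    const void *payload, size_t payload_len)
    {
            struct wire_header whdr = {
                    .serial = serial,         /* fresh serial per transmission */
                    .seq    = seq,
            };
            struct iovec iov[2] = {
                    { .iov_base = &whdr,           .iov_len = sizeof(whdr) },
                    { .iov_base = (void *)payload, .iov_len = payload_len },
            };
            struct msghdr msg;

            memset(&msg, 0, sizeof(msg));
            msg.msg_iov = iov;
            msg.msg_iovlen = 2;
            return sendmsg(fd, &msg, 0);
    }
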
/*
@@ -314,7 +412,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
whdr.type = RXRPC_PACKET_TYPE_ABORT;
while ((skb = skb_dequeue(&local->reject_queue))) {
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
sp = rxrpc_skb(skb);
if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
@@ -333,7 +431,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
kernel_sendmsg(local->socket, &msg, iov, 2, size);
}
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
}
_leave("");
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 9e0725f..bf13b84 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -155,11 +155,11 @@ void rxrpc_error_report(struct sock *sk)
_leave("UDP socket errqueue empty");
return;
}
- rxrpc_new_skb(skb);
+ rxrpc_new_skb(skb, rxrpc_skb_rx_received);
serr = SKB_EXT_ERR(skb);
if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
_leave("UDP empty message");
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
return;
}
@@ -169,7 +169,7 @@ void rxrpc_error_report(struct sock *sk)
peer = NULL;
if (!peer) {
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
_leave(" [no peer]");
return;
}
@@ -179,7 +179,7 @@ void rxrpc_error_report(struct sock *sk)
serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
rxrpc_adjust_mtu(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
rxrpc_put_peer(peer);
_leave(" [MTU update]");
return;
@@ -187,7 +187,7 @@ void rxrpc_error_report(struct sock *sk)
rxrpc_store_error(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
/* The ref we obtained is passed off to the work item */
rxrpc_queue_work(&peer->error_distributor);
@@ -305,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
rxrpc_put_peer(peer);
_leave("");
}
+
+/*
+ * Add RTT information to cache. This is called in softirq mode and has
+ * exclusive access to the peer RTT data.
+ */
+void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+ rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+ ktime_t send_time, ktime_t resp_time)
+{
+ struct rxrpc_peer *peer = call->peer;
+ s64 rtt;
+ u64 sum = peer->rtt_sum, avg;
+ u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage;
+
+ rtt = ktime_to_ns(ktime_sub(resp_time, send_time));
+ if (rtt < 0)
+ return;
+
+ /* Replace the oldest datum in the RTT buffer */
+ sum -= peer->rtt_cache[cursor];
+ sum += rtt;
+ peer->rtt_cache[cursor] = rtt;
+ peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
+ peer->rtt_sum = sum;
+ if (usage < RXRPC_RTT_CACHE_SIZE) {
+ usage++;
+ peer->rtt_usage = usage;
+ }
+
+ /* Now recalculate the average */
+ if (usage == RXRPC_RTT_CACHE_SIZE) {
+ avg = sum / RXRPC_RTT_CACHE_SIZE;
+ } else {
+ avg = sum;
+ do_div(avg, usage);
+ }
+
+ peer->rtt = avg;
+ trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
+ usage, avg);
+}
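
rxrpc_peer_add_rtt() keeps a fixed-size ring of samples plus a running sum, so each insertion updates the average in O(1) instead of rescanning the buffer. A self-contained sketch of the scheme (the cache size mirrors a power-of-two RXRPC_RTT_CACHE_SIZE; the kernel uses do_div() for the division when usage is partial):

    #include <stdint.h>

    #define RTT_CACHE_SIZE 8                  /* power of two, as in rxrpc */

    struct rtt_cache {
            int64_t      sample[RTT_CACHE_SIZE];
            uint64_t     sum;
            unsigned int cursor, usage;
    };

    /* Insert one RTT sample (nanoseconds) and return the running average. */
    static uint64_t rtt_add(struct rtt_cache *c, int64_t rtt)
    {
            if (rtt < 0)                      /* clock went backwards: drop */
                    return c->usage ? c->sum / c->usage : 0;

            c->sum -= c->sample[c->cursor];   /* evict the oldest sample */
            c->sum += rtt;
            c->sample[c->cursor] = rtt;
            c->cursor = (c->cursor + 1) & (RTT_CACHE_SIZE - 1);
            if (c->usage < RTT_CACHE_SIZE)
                    c->usage++;
            return c->sum / c->usage;
    }
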
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index f3e5766..941b724 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -244,6 +244,7 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
peer->hash_key = hash_key;
rxrpc_assess_MTU_size(peer);
peer->mtu = peer->if_mtu;
+ peer->rtt_last_req = ktime_get_real();
switch (peer->srx.transport.family) {
case AF_INET:
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a284205..f05ea0a 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -94,6 +94,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
break;
}
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack,
+ call->rx_pkt_offset, call->rx_pkt_len, ret);
return ret;
}
@@ -124,21 +126,24 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx,
write_unlock(&rx->call_lock);
}
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret);
return ret;
}
/*
* End the packet reception phase.
*/
-static void rxrpc_end_rx_phase(struct rxrpc_call *call)
+static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
{
_enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]);
+ trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
+ ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
+
if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
- rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false);
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
+ rxrpc_propose_ack_terminal_ack);
rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
- } else {
- rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, false);
}
write_lock_bh(&call->state_lock);
@@ -149,6 +154,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call)
break;
case RXRPC_CALL_SERVER_RECV_REQUEST:
+ call->tx_phase = true;
call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
break;
default:
@@ -163,8 +169,11 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call)
*/
static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
{
+ struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
+ rxrpc_serial_t serial;
rxrpc_seq_t hard_ack, top;
+ u8 flags;
int ix;
_enter("%d", call->debug_id);
@@ -176,17 +185,35 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
hard_ack++;
ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_rotated);
+ sp = rxrpc_skb(skb);
+ flags = sp->hdr.flags;
+ serial = sp->hdr.serial;
+ if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO)
+ serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1;
+
call->rxtx_buffer[ix] = NULL;
call->rxtx_annotations[ix] = 0;
/* Barrier against rxrpc_input_data(). */
smp_store_release(&call->rx_hard_ack, hard_ack);
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
- _debug("%u,%u,%lx", hard_ack, top, call->flags);
- if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags))
- rxrpc_end_rx_phase(call);
+ _debug("%u,%u,%02x", hard_ack, top, flags);
+ trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack);
+ if (flags & RXRPC_LAST_PACKET) {
+ rxrpc_end_rx_phase(call, serial);
+ } else {
+ /* Check to see if there's an ACK that needs sending. */
+ if (after_eq(hard_ack, call->ackr_consumed + 2) ||
+ after_eq(top, call->ackr_seen + 2) ||
+ (hard_ack == top && after(hard_ack, call->ackr_consumed)))
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial,
+ true, false,
+ rxrpc_propose_ack_rotate_rx);
+ if (call->ackr_reason)
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+ }
}
/*
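
Note the hysteresis in the non-terminal branch above: a delayed ACK is only proposed once the hard-ack point has advanced at least two packets past what the last ACK reported (or the received top has moved similarly, or the window has fully drained). That batches ACK traffic instead of acking every single rotation. A stand-alone model of the gate, with a hypothetical struct; the kernel uses the wrap-safe after()/after_eq() macros where this sketch uses plain comparisons:

#include <stdbool.h>
#include <stdint.h>

struct rx_state {
	uint32_t hard_ack;	/* last consumed sequence */
	uint32_t top;		/* highest received sequence */
	uint32_t ackr_consumed;	/* consumption point last reported in an ACK */
	uint32_t ackr_seen;	/* top last reported in an ACK */
};

static bool should_propose_ack(const struct rx_state *s)
{
	return s->hard_ack >= s->ackr_consumed + 2 ||
	       s->top >= s->ackr_seen + 2 ||
	       (s->hard_ack == s->top && s->hard_ack > s->ackr_consumed);
}

int main(void)
{
	struct rx_state s = {
		.hard_ack = 5, .top = 5, .ackr_consumed = 3, .ackr_seen = 5,
	};

	/* consumed two packets past the last reported point: propose */
	return should_propose_ack(&s) ? 0 : 1;
}
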
@@ -234,18 +261,13 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
u8 *_annotation,
unsigned int *_offset, unsigned int *_len)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int offset = *_offset;
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len = *_len;
int ret;
u8 annotation = *_annotation;
- if (offset > 0)
- return 0;
-
/* Locate the subpacket */
- offset = sp->offset;
- len = skb->len - sp->offset;
+ len = skb->len - offset;
if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) {
offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) *
RXRPC_JUMBO_SUBPKTLEN);
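
rxrpc_locate_data() no longer consults a per-skb sp->offset; the data offset is derived purely from the wire header size plus the jumbo index carried in the ring annotation. Assuming illustrative sizes for the wire header and jumbo subpacket, the arithmetic looks like this:

#include <assert.h>
#include <stdio.h>

#define WIRE_HDR_LEN	  28	/* sizeof(struct rxrpc_wire_header), illustrative */
#define JUMBO_SUBPKTLEN	1416	/* RXRPC_JUMBO_SUBPKTLEN, illustrative */

/* In the annotation scheme, jumbo index 0 means "not a jumbo subpacket";
 * index n > 0 selects the (n - 1)th subpacket within the jumbo frame. */
static unsigned int subpacket_offset(unsigned int jumbo_index)
{
	unsigned int offset = WIRE_HDR_LEN;

	if (jumbo_index > 0)
		offset += (jumbo_index - 1) * JUMBO_SUBPKTLEN;
	return offset;
}

int main(void)
{
	assert(subpacket_offset(0) == 28);		/* plain DATA packet */
	assert(subpacket_offset(2) == 28 + 1416);	/* 2nd jumbo subpacket */
	printf("ok\n");
	return 0;
}
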
@@ -281,32 +303,53 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
size_t remain;
bool last;
unsigned int rx_pkt_offset, rx_pkt_len;
- int ix, copy, ret = 0;
-
- _enter("");
+ int ix, copy, ret = -EAGAIN, ret2;
rx_pkt_offset = call->rx_pkt_offset;
rx_pkt_len = call->rx_pkt_len;
+ if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
+ seq = call->rx_hard_ack;
+ ret = 1;
+ goto done;
+ }
+
/* Barriers against rxrpc_input_data(). */
hard_ack = call->rx_hard_ack;
top = smp_load_acquire(&call->rx_top);
for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
ix = seq & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
- if (!skb)
+ if (!skb) {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
break;
+ }
smp_rmb();
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
sp = rxrpc_skb(skb);
+ if (!(flags & MSG_PEEK))
+ trace_rxrpc_receive(call, rxrpc_receive_front,
+ sp->hdr.serial, seq);
+
if (msg)
sock_recv_timestamp(msg, sock->sk, skb);
- ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix],
- &rx_pkt_offset, &rx_pkt_len);
- _debug("recvmsg %x DATA #%u { %d, %d }",
- sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len);
+ if (rx_pkt_offset == 0) {
+ ret2 = rxrpc_locate_data(call, skb,
+ &call->rxtx_annotations[ix],
+ &rx_pkt_offset, &rx_pkt_len);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
+ rx_pkt_offset, rx_pkt_len, ret2);
+ if (ret2 < 0) {
+ ret = ret2;
+ goto out;
+ }
+ } else {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
+ }
/* We have to handle short, empty and used-up DATA packets. */
remain = len - *_offset;
@@ -314,22 +357,24 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (copy > remain)
copy = remain;
if (copy > 0) {
- ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
- copy);
- if (ret < 0)
+ ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
+ copy);
+ if (ret2 < 0) {
+ ret = ret2;
goto out;
+ }
/* handle piecemeal consumption of data packets */
- _debug("copied %d @%zu", copy, *_offset);
-
rx_pkt_offset += copy;
rx_pkt_len -= copy;
*_offset += copy;
}
if (rx_pkt_len > 0) {
- _debug("buffer full");
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
ASSERTCMP(*_offset, ==, len);
+ ret = 0;
break;
}
@@ -340,20 +385,21 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
rx_pkt_offset = 0;
rx_pkt_len = 0;
- ASSERTIFCMP(last, seq, ==, top);
- }
-
- if (after(seq, top)) {
- ret = -EAGAIN;
- if (test_bit(RXRPC_CALL_RX_LAST, &call->flags))
+ if (last) {
+ ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
ret = 1;
+ goto out;
+ }
}
+
out:
if (!(flags & MSG_PEEK)) {
call->rx_pkt_offset = rx_pkt_offset;
call->rx_pkt_len = rx_pkt_len;
}
- _leave(" = %d [%u/%u]", ret, seq, top);
+done:
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
+ rx_pkt_offset, rx_pkt_len, ret);
return ret;
}
@@ -374,7 +420,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
DEFINE_WAIT(wait);
- _enter(",,,%zu,%d", len, flags);
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0);
if (flags & (MSG_OOB | MSG_TRUNC))
return -EOPNOTSUPP;
@@ -394,8 +440,10 @@ try_again:
if (list_empty(&rx->recvmsg_q)) {
ret = -EWOULDBLOCK;
- if (timeo == 0)
+ if (timeo == 0) {
+ call = NULL;
goto error_no_call;
+ }
release_sock(&rx->sk);
@@ -409,6 +457,8 @@ try_again:
if (list_empty(&rx->recvmsg_q)) {
if (signal_pending(current))
goto wait_interrupted;
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait,
+ 0, 0, 0, 0);
timeo = schedule_timeout(timeo);
}
finish_wait(sk_sleep(&rx->sk), &wait);
@@ -427,7 +477,7 @@ try_again:
rxrpc_get_call(call, rxrpc_call_got);
write_unlock_bh(&rx->recvmsg_lock);
- _debug("recvmsg call %p", call);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
@@ -497,16 +547,15 @@ error:
rxrpc_put_call(call, rxrpc_call_put);
error_no_call:
release_sock(&rx->sk);
- _leave(" = %d", ret);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
return ret;
wait_interrupted:
ret = sock_intr_errno(timeo);
wait_error:
finish_wait(sk_sleep(&rx->sk), &wait);
- release_sock(&rx->sk);
- _leave(" = %d [wait]", ret);
- return ret;
+ call = NULL;
+ goto error_no_call;
}
/**
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index ae39255..627abed 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
case RXRPC_SECURITY_AUTH:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level1_hdr);
- conn->header_size += sizeof(struct rxkad_level1_hdr);
break;
case RXRPC_SECURITY_ENCRYPT:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level2_hdr);
- conn->header_size += sizeof(struct rxkad_level2_hdr);
break;
default:
ret = -EKEYREJECTED;
@@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
data_size |= (u32)check << 16;
hdr.data_size = htonl(data_size);
@@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
@@ -277,7 +275,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
/* calculate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
x |= sp->hdr.seq & 0x3fffffff;
- call->crypto_buf[0] = htonl(sp->hdr.callNumber);
+ call->crypto_buf[0] = htonl(call->call_id);
call->crypto_buf[1] = htonl(x);
sg_init_one(&sg, call->crypto_buf, 8);
@@ -773,7 +771,8 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
}
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, sp->offset, &challenge, sizeof(challenge)) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &challenge, sizeof(challenge)) < 0)
goto protocol_error;
version = ntohl(challenge.version);
@@ -1030,7 +1029,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, sp->offset, &response, sizeof(response)) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ &response, sizeof(response)) < 0)
goto protocol_error;
if (!pskb_pull(skb, sizeof(response)))
BUG();
@@ -1059,7 +1059,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
return -ENOMEM;
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, sp->offset, ticket, ticket_len) < 0)
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ ticket, ticket_len) < 0)
goto protocol_error_free;
ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key,
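
All three rxkad reads above switch from the per-skb sp->offset to the constant wire-header size, because the header is no longer stripped before the security layer runs. What skb_copy_bits() does here is a bounds-checked copy at a fixed offset; a user-space model, with an illustrative header size:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define WIRE_HDR_LEN 28		/* sizeof(struct rxrpc_wire_header), illustrative */

/* Copy len bytes starting just past the wire header; return -1 if the
 * packet is too short, mirroring skb_copy_bits()'s failure case. */
static int copy_after_header(const unsigned char *pkt, size_t pkt_len,
			     void *out, size_t len)
{
	if (pkt_len < WIRE_HDR_LEN + len)
		return -1;
	memcpy(out, pkt + WIRE_HDR_LEN, len);
	return 0;
}

int main(void)
{
	unsigned char pkt[64] = { 0 };
	unsigned int challenge;

	if (copy_after_header(pkt, sizeof(pkt), &challenge, sizeof(challenge)) < 0)
		fprintf(stderr, "RXKADPACKETSHORT\n");
	return 0;
}
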
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index 82d8134..7d921e5 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -131,10 +131,10 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
/* find the service */
read_lock(&local->services_lock);
- hlist_for_each_entry(rx, &local->services, listen_link) {
- if (rx->srx.srx_service == conn->params.service_id)
- goto found_service;
- }
+ rx = rcu_dereference_protected(local->service,
+ lockdep_is_held(&local->services_lock));
+ if (rx && rx->srx.srx_service == conn->params.service_id)
+ goto found_service;
/* the service appears to have died */
read_unlock(&local->services_lock);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index cba2365..3322543 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -45,7 +45,9 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
ret = 0;
- if (call->tx_top - call->tx_hard_ack < call->tx_winsize)
+ if (call->tx_top - call->tx_hard_ack <
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra))
break;
if (call->state >= RXRPC_CALL_COMPLETE) {
ret = -call->error;
@@ -56,6 +58,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
break;
}
+ trace_rxrpc_transmit(call, rxrpc_transmit_wait);
release_sock(&rx->sk);
*timeo = schedule_timeout(*timeo);
lock_sock(&rx->sk);
@@ -93,19 +96,30 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
rxrpc_seq_t seq = sp->hdr.seq;
int ret, ix;
+ u8 annotation = RXRPC_TX_ANNO_UNACK;
_net("queue skb %p [%d]", skb, seq);
ASSERTCMP(seq, ==, call->tx_top + 1);
+ if (last)
+ annotation |= RXRPC_TX_ANNO_LAST;
+
+ /* We have to set the timestamp before queueing as the retransmit
+ * algorithm can see the packet as soon as we queue it.
+ */
+ skb->tstamp = ktime_get_real();
+
ix = seq & RXRPC_RXTX_BUFF_MASK;
- rxrpc_get_skb(skb);
- call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK;
+ rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ call->rxtx_annotations[ix] = annotation;
smp_wmb();
call->rxtx_buffer[ix] = skb;
call->tx_top = seq;
if (last)
- set_bit(RXRPC_CALL_TX_LAST, &call->flags);
+ trace_rxrpc_transmit(call, rxrpc_transmit_queue_last);
+ else
+ trace_rxrpc_transmit(call, rxrpc_transmit_queue);
if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
_debug("________awaiting reply/ACK__________");
@@ -127,43 +141,26 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
write_unlock_bh(&call->state_lock);
}
- _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
-
if (seq == 1 && rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
- sp->resend_at = jiffies + rxrpc_resend_timeout;
- ret = rxrpc_send_data_packet(call->conn, skb);
+ ret = rxrpc_send_data_packet(call, skb, false);
if (ret < 0) {
_debug("need instant resend %d", ret);
rxrpc_instant_resend(call, ix);
- }
+ } else {
+ ktime_t now = ktime_get_real(), resend_at;
- rxrpc_free_skb(skb);
- _leave("");
-}
+ resend_at = ktime_add_ms(now, rxrpc_resend_timeout);
-/*
- * Convert a host-endian header into a network-endian header.
- */
-static void rxrpc_insert_header(struct sk_buff *skb)
-{
- struct rxrpc_wire_header whdr;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ if (ktime_before(resend_at, call->resend_at)) {
+ call->resend_at = resend_at;
+ rxrpc_set_timer(call, rxrpc_timer_set_for_send, now);
+ }
+ }
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = htonl(sp->hdr.serial);
- whdr.type = sp->hdr.type;
- whdr.flags = sp->hdr.flags;
- whdr.userStatus = sp->hdr.userStatus;
- whdr.securityIndex = sp->hdr.securityIndex;
- whdr._rsvd = htons(sp->hdr._rsvd);
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- memcpy(skb->head, &whdr, sizeof(whdr));
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ _leave("");
}
/*
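
Two details in the queueing path above are worth calling out: the skb is timestamped before it is queued (the retransmit logic may inspect it the moment it becomes visible), and a successful transmission only ever pulls call->resend_at earlier, never pushes it later, so an older packet's deadline cannot be lost. A sketch of that earliest-deadline update with stand-in types:

#include <stdint.h>

typedef int64_t ktime_ns;		/* stand-in for ktime_t */

struct call_timer {
	ktime_ns resend_at;		/* currently armed deadline */
};

/* Arm for "now + timeout", but only if that is sooner than whatever is
 * already pending; a later deadline must not displace an earlier one
 * set on behalf of a previously queued packet. */
static int maybe_pull_timer(struct call_timer *t, ktime_ns now,
			    int64_t timeout_ms)
{
	ktime_ns resend_at = now + timeout_ms * 1000000;

	if (resend_at < t->resend_at) {
		t->resend_at = resend_at;
		return 1;		/* caller re-arms the timer */
	}
	return 0;
}

int main(void)
{
	struct call_timer t = { .resend_at = INT64_MAX };

	return maybe_pull_timer(&t, 0, 4000) ? 0 : 1;	/* first packet arms it */
}
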
@@ -194,17 +191,22 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
skb = call->tx_pending;
call->tx_pending = NULL;
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
copied = 0;
do {
+ /* Check to see if there's a ping ACK to reply to. */
+ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+
if (!skb) {
size_t size, chunk, max, space;
_debug("alloc");
if (call->tx_top - call->tx_hard_ack >=
- call->tx_winsize) {
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra)) {
ret = -EAGAIN;
if (msg->msg_flags & MSG_DONTWAIT)
goto maybe_error;
@@ -214,7 +216,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
goto maybe_error;
}
- max = call->conn->params.peer->maxdata;
+ max = RXRPC_JUMBO_DATALEN;
max -= call->conn->security_size;
max &= ~(call->conn->size_align - 1UL);
@@ -225,7 +227,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
space = chunk + call->conn->size_align;
space &= ~(call->conn->size_align - 1UL);
- size = space + call->conn->header_size;
+ size = space + call->conn->security_size;
_debug("SIZE: %zu/%zu/%zu", chunk, space, size);
@@ -235,15 +237,15 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
if (!skb)
goto maybe_error;
- rxrpc_new_skb(skb);
+ rxrpc_new_skb(skb, rxrpc_skb_tx_new);
_debug("ALLOC SEND %p", skb);
ASSERTCMP(skb->mark, ==, 0);
- _debug("HS: %u", call->conn->header_size);
- skb_reserve(skb, call->conn->header_size);
- skb->len += call->conn->header_size;
+ _debug("HS: %u", call->conn->security_size);
+ skb_reserve(skb, call->conn->security_size);
+ skb->len += call->conn->security_size;
sp = rxrpc_skb(skb);
sp->remain = chunk;
@@ -305,33 +307,21 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
seq = call->tx_top + 1;
- sp->hdr.epoch = conn->proto.epoch;
- sp->hdr.cid = call->cid;
- sp->hdr.callNumber = call->call_id;
sp->hdr.seq = seq;
- sp->hdr.serial = atomic_inc_return(&conn->serial);
- sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
- sp->hdr.userStatus = 0;
- sp->hdr.securityIndex = call->security_ix;
sp->hdr._rsvd = 0;
- sp->hdr.serviceId = call->service_id;
+ sp->hdr.flags = conn->out_clientflag;
- sp->hdr.flags = conn->out_clientflag;
if (msg_data_left(msg) == 0 && !more)
sp->hdr.flags |= RXRPC_LAST_PACKET;
else if (call->tx_top - call->tx_hard_ack <
call->tx_winsize)
sp->hdr.flags |= RXRPC_MORE_PACKETS;
- if (more && seq & 1)
- sp->hdr.flags |= RXRPC_REQUEST_ACK;
ret = conn->security->secure_packet(
- call, skb, skb->mark,
- skb->head + sizeof(struct rxrpc_wire_header));
+ call, skb, skb->mark, skb->head);
if (ret < 0)
goto out;
- rxrpc_insert_header(skb);
rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
skb = NULL;
}
@@ -345,7 +335,7 @@ out:
return ret;
call_terminated:
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
_leave(" = %d", -call->error);
return -call->error;
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 620d9cc..67b02c4 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -18,50 +18,67 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
+#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
+
/*
- * Note the existence of a new-to-us socket buffer (allocated or dequeued).
+ * Note the allocation or reception of a socket buffer.
*/
-void rxrpc_new_skb(struct sk_buff *skb)
+void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here);
+ int n = atomic_inc_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
}
/*
* Note the re-emergence of a socket buffer from a queue or buffer.
*/
-void rxrpc_see_skb(struct sk_buff *skb)
+void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
if (skb) {
- int n = atomic_read(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here);
+ int n = atomic_read(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
}
}
/*
* Note the addition of a ref on a socket buffer.
*/
-void rxrpc_get_skb(struct sk_buff *skb)
+void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here);
+ int n = atomic_inc_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
skb_get(skb);
}
/*
* Note the destruction of a socket buffer.
*/
-void rxrpc_free_skb(struct sk_buff *skb)
+void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+{
+ const void *here = __builtin_return_address(0);
+ if (skb) {
+ int n;
+ CHECK_SLAB_OKAY(&skb->users);
+ n = atomic_dec_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
+ kfree_skb(skb);
+ }
+}
+
+/*
+ * Note the injected loss of a socket buffer.
+ */
+void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
if (skb) {
int n;
CHECK_SLAB_OKAY(&skb->users);
- n = atomic_dec_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here);
+ n = atomic_dec_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
kfree_skb(skb);
}
}
@@ -74,8 +91,9 @@ void rxrpc_purge_queue(struct sk_buff_head *list)
const void *here = __builtin_return_address(0);
struct sk_buff *skb;
while ((skb = skb_dequeue((list))) != NULL) {
- int n = atomic_dec_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here);
+ int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged));
+ trace_rxrpc_skb(skb, rxrpc_skb_rx_purged,
+ atomic_read(&skb->users), n, here);
kfree_skb(skb);
}
}
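
The skbuff.c rework replaces the single rxrpc_n_skbs counter with separate Tx and Rx counters, chosen by where the trace op falls in the enum, and every wrapper feeds the op straight into the tracepoint along with __builtin_return_address(0). A condensed user-space model of the select-by-op-range trick; the enum values here are hypothetical, the kernel's boundary op being rxrpc_skb_tx_cleaned:

#include <stdio.h>

enum skb_trace {
	SKB_RX_RECEIVED,
	SKB_RX_FREED,
	SKB_TX_NEW,		/* first Tx op: every op >= here is Tx */
	SKB_TX_FREED,
};

static int n_rx_skbs, n_tx_skbs;

/* Pick the counter from the op, as select_skb_count() does above. */
static int *select_count(enum skb_trace op)
{
	return op >= SKB_TX_NEW ? &n_tx_skbs : &n_rx_skbs;
}

int main(void)
{
	++*select_count(SKB_RX_RECEIVED);
	++*select_count(SKB_TX_NEW);
	--*select_count(SKB_RX_FREED);
	printf("rx=%d tx=%d\n", n_rx_skbs, n_tx_skbs);	/* rx=0 tx=1 */
	return 0;
}
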
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index a03c61c..34c706d 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -35,7 +35,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_requested_ack_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&zero,
},
{
@@ -43,7 +43,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_soft_ack_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
{
@@ -51,7 +51,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_idle_ack_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
{
@@ -59,7 +59,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_resend_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
{
@@ -85,7 +85,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_max_call_lifetime,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
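
The sysctl handlers drop their jiffies conversions because the delays and timeouts above are now consumed in milliseconds directly (see the ktime_add_ms(now, rxrpc_resend_timeout) call in sendmsg.c), so the stored integers need no translation on read or write. With proc_dointvec each entry becomes a plain integer knob; an illustrative entry, not one of the kernel's:

#include <linux/sysctl.h>

/* Illustrative: a millisecond-valued knob exposed verbatim, with no
 * ms<->jiffies translation in the handler. */
static unsigned int my_resend_timeout_ms = 4000;

static struct ctl_table my_table[] = {
	{
		.procname	= "resend_timeout",
		.data		= &my_resend_timeout_ms,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
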
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 7795d5a..87956a7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -793,6 +793,11 @@ config NET_IFE_SKBPRIO
depends on NET_ACT_IFE
---help---
+config NET_IFE_SKBTCINDEX
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+ ---help---
+
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 148ae0d..4bdda36 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
+obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d068..c910217 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -592,9 +592,19 @@ err_out:
return ERR_PTR(err);
}
-int tcf_action_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, char *name, int ovr,
- int bind, struct list_head *actions)
+static void cleanup_a(struct list_head *actions, int ovr)
+{
+ struct tc_action *a;
+
+ if (!ovr)
+ return;
+
+ list_for_each_entry(a, actions, list)
+ a->tcfa_refcnt--;
+}
+
+int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+ char *name, int ovr, int bind, struct list_head *actions)
{
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
@@ -612,8 +622,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla,
goto err;
}
act->order = i;
+ if (ovr)
+ act->tcfa_refcnt++;
list_add_tail(&act->list, actions);
}
+
+ /* Remove the temp refcnt which was necessary to protect against
+ * destroying an existing action which was being replaced
+ */
+ cleanup_a(actions, ovr);
return 0;
err:
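
The temporary refcount bump above keeps each action alive while a replace (ovr) is staged: without the extra reference, replacing an existing action could destroy it mid-operation. cleanup_a() then drops the bump once the whole batch has been initialised. The pattern in miniature, with a hypothetical object type:

#include <assert.h>

struct action { int refcnt; };

/* Hold one extra ref per object while a replace is staged... */
static void stage_for_replace(struct action *a)
{
	a->refcnt++;
}

/* ...and drop it only after every object in the batch is committed. */
static void commit_replace(struct action **acts, int n)
{
	int i;

	for (i = 0; i < n; i++)
		acts[i]->refcnt--;
}

int main(void)
{
	struct action a = { .refcnt = 1 }, *batch[] = { &a };

	stage_for_replace(&a);
	assert(a.refcnt == 2);		/* safe against a concurrent destroy */
	commit_replace(batch, 1);
	assert(a.refcnt == 1);
	return 0;
}
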
@@ -883,6 +900,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
goto err;
}
act->order = i;
+ if (event == RTM_GETACTION)
+ act->tcfa_refcnt++;
list_add_tail(&act->list, &actions);
}
@@ -923,9 +942,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
return err;
}
-static int
-tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
- u32 portid, int ovr)
+static int tcf_action_add(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid, int ovr)
{
int ret = 0;
LIST_HEAD(actions);
@@ -988,8 +1006,7 @@ replay:
return ret;
}
-static struct nlattr *
-find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
{
struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
@@ -1016,8 +1033,7 @@ find_dump_kind(const struct nlmsghdr *n)
return kind;
}
-static int
-tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
+static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nlmsghdr *nlh;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b5dbf63..e0defce 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
return (void *)(skb_network_header(skb) + ihl);
}
-static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct icmphdr *icmph;
@@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct icmp6hdr *icmp6h;
const struct ipv6hdr *ip6h;
@@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct tcphdr *tcph;
const struct iphdr *iph;
@@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct tcphdr *tcph;
const struct ipv6hdr *ip6h;
@@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv4_udp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl, int udplite)
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct iphdr *iph;
@@ -270,8 +270,8 @@ ignore_obscure_skb:
return 1;
}
-static int tcf_csum_ipv6_udp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl, int udplite)
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct ipv6hdr *ip6h;
@@ -380,8 +380,8 @@ fail:
return 0;
}
-static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
- unsigned int ixhl, unsigned int *pl)
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl,
+ unsigned int *pl)
{
int off, len, optlen;
unsigned char *xh = (void *)ip6xh;
@@ -494,8 +494,8 @@ fail:
return 0;
}
-static int tcf_csum(struct sk_buff *skb,
- const struct tc_action *a, struct tcf_result *res)
+static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_csum *p = to_tcf_csum(a);
int action;
@@ -531,8 +531,8 @@ drop:
return TC_ACT_SHOT;
}
-static int tcf_csum_dump(struct sk_buff *skb,
- struct tc_action *a, int bind, int ref)
+static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_csum *p = to_tcf_csum(a);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e24a409..e0aa30f 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets);
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
+ packets);
if (action == TC_ACT_SHOT)
this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index e87cd81..95c463c 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -53,7 +53,7 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
u32 *tlv = (u32 *)(skbdata);
u16 totlen = nla_total_size(dlen); /*alignment + hdr */
char *dptr = (char *)tlv + NLA_HDRLEN;
- u32 htlv = attrtype << 16 | dlen;
+ u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
*tlv = htonl(htlv);
memset(dptr, 0, totlen - NLA_HDRLEN);
@@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
}
EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+ u16 edata = 0;
+
+ if (mi->metaval)
+ edata = *(u16 *)mi->metaval;
+ else if (metaval)
+ edata = metaval;
+
+ if (!edata) /* will not encode */
+ return 0;
+
+ edata = htons(edata);
+ return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u16);
+
int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
{
if (mi->metaval)
@@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
}
EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
+{
+ if (metaval || mi->metaval)
+ return 8; /* T+L+(V) == 2+2+(2+2bytepad) */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u16);
+
int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
{
u32 edata = metaval;
@@ -627,7 +653,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
struct tcf_ife_info *ife = to_ife(a);
int action = ife->tcf_action;
struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
- u16 ifehdrln = ifehdr->metalen;
+ int ifehdrln = (int)ifehdr->metalen;
struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
spin_lock(&ife->tcf_lock);
@@ -740,8 +766,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
return TC_ACT_SHOT;
}
- iethh = eth_hdr(skb);
-
err = skb_cow_head(skb, hdrm);
if (unlikely(err)) {
ife->tcf_qstats.drops++;
@@ -752,6 +776,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
if (!(at & AT_EGRESS))
skb_push(skb, skb->dev->hard_header_len);
+ iethh = (struct ethhdr *)skb->data;
__skb_push(skb, hdrm);
memcpy(skb->data, iethh, skb->mac_len);
skb_reset_mac_header(skb);
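
Two fixes in act_ife: the TLV length word now includes the NLA header as well as the payload (plain dlen produced under-length TLVs), and the ethernet header pointer is re-read after skb_push() so it points into the current skb data rather than a stale location. For a 16-bit metadatum the length math is worth making concrete; a small check, assuming the usual 4-byte NLA header and 4-byte alignment:

#include <assert.h>
#include <stdio.h>

#define NLA_HDRLEN	4
#define NLA_ALIGN(x)	(((x) + 3) & ~3u)

int main(void)
{
	unsigned int dlen = 2;				/* u16 payload */
	unsigned int tlv_len = dlen + NLA_HDRLEN;	/* 6: goes in the length word */
	unsigned int total = NLA_ALIGN(tlv_len);	/* 8: space actually consumed */

	/* Matches ife_check_meta_u16()'s "T+L+(V) == 2+2+(2+2bytepad)" = 8 */
	assert(tlv_len == 6 && total == 8);
	printf("len=%u total=%u\n", tlv_len, total);
	return 0;
}
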
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
new file mode 100644
index 0000000..3b35774
--- /dev/null
+++ b/net/sched/act_meta_skbtcindex.c
@@ -0,0 +1,78 @@
+/*
+ * net/sched/act_meta_skbtcindex.c IFE skb->tc_index metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2016)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+
+static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifetc_index = skb->tc_index;
+
+ return ife_encode_meta_u16(ifetc_index, skbdata, e);
+}
+
+static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u16 ifetc_index = *(u16 *)data;
+
+ skb->tc_index = ntohs(ifetc_index);
+ return 0;
+}
+
+static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u16(skb->tc_index, e);
+}
+
+static struct tcf_meta_ops ife_skbtcindex_ops = {
+ .metaid = IFE_META_TCINDEX,
+ .metatype = NLA_U16,
+ .name = "tc_index",
+ .synopsis = "skb tc_index 16 bit metadata",
+ .check_presence = skbtcindex_check,
+ .encode = skbtcindex_encode,
+ .decode = skbtcindex_decode,
+ .get = ife_get_meta_u16,
+ .alloc = ife_alloc_meta_u16,
+ .release = ife_release_meta_gen,
+ .validate = ife_validate_meta_u16,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifetc_index_init_module(void)
+{
+ return register_ife_op(&ife_skbtcindex_ops);
+}
+
+static void __exit ifetc_index_cleanup_module(void)
+{
+ unregister_ife_op(&ife_skbtcindex_ops);
+}
+
+module_init(ifetc_index_init_module);
+module_exit(ifetc_index_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2016)");
+MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_TCINDEX);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6038c85..667dc38 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -204,7 +204,15 @@ out:
return retval;
}
-static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse)
+{
+ tcf_lastuse_update(&a->tcfa_tm);
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+}
+
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = to_mirred(a);
@@ -280,6 +288,7 @@ static struct tc_action_ops act_mirred_ops = {
.type = TCA_ACT_MIRRED,
.owner = THIS_MODULE,
.act = tcf_mirred,
+ .stats_update = tcf_stats_update,
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8a3be1d..d1bd248 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -249,6 +249,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
+ if (police->tcfp_result == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcfp_result;
}
@@ -261,8 +263,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
return police->tcf_action;
}
-static int
-tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = to_police(a);
@@ -347,14 +349,12 @@ static struct pernet_operations police_net_ops = {
.size = sizeof(struct tc_action_net),
};
-static int __init
-police_init_module(void)
+static int __init police_init_module(void)
{
return tcf_register_action(&act_police_ops, &police_net_ops);
}
-static void __exit
-police_cleanup_module(void)
+static void __exit police_cleanup_module(void)
{
tcf_unregister_action(&act_police_ops, &police_net_ops);
}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 59a8d31..b57fcbc 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -30,12 +30,19 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
struct tcf_vlan *v = to_vlan(a);
int action;
int err;
+ u16 tci;
spin_lock(&v->tcf_lock);
tcf_lastuse_update(&v->tcf_tm);
bstats_update(&v->tcf_bstats, skb);
action = v->tcf_action;
+ /* Ensure 'data' points at mac_header prior to calling the VLAN-manipulating
+ * functions.
+ */
+ if (skb_at_tc_ingress(skb))
+ skb_push_rcsum(skb, skb->mac_len);
+
switch (v->tcfv_action) {
case TCA_VLAN_ACT_POP:
err = skb_vlan_pop(skb);
@@ -48,6 +55,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
if (err)
goto drop;
break;
+ case TCA_VLAN_ACT_MODIFY:
+ /* No-op if no vlan tag (either hw-accel or in-payload) */
+ if (!skb_vlan_tagged(skb))
+ goto unlock;
+ /* extract existing tag (and guarantee no hw-accel tag) */
+ if (skb_vlan_tag_present(skb)) {
+ tci = skb_vlan_tag_get(skb);
+ skb->vlan_tci = 0;
+ } else {
+ /* in-payload vlan tag, pop it */
+ err = __skb_vlan_pop(skb, &tci);
+ if (err)
+ goto drop;
+ }
+ /* replace the vid */
+ tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid;
+ /* replace prio bits, if tcfv_push_prio specified */
+ if (v->tcfv_push_prio) {
+ tci &= ~VLAN_PRIO_MASK;
+ tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT;
+ }
+ /* put updated tci as hwaccel tag */
+ __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci);
+ break;
default:
BUG();
}
@@ -58,6 +89,9 @@ drop:
action = TC_ACT_SHOT;
v->tcf_qstats.drops++;
unlock:
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
spin_unlock(&v->tcf_lock);
return action;
}
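
TCA_VLAN_ACT_MODIFY rewrites the VID (and, when given, the priority bits) inside an existing tag instead of pushing a new one, whether the tag lives in the hw-accel field or in the payload. The mask arithmetic is compact; a self-contained check using the standard 802.1Q field layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VLAN_VID_MASK	0x0fff	/* bits 0-11: VLAN ID */
#define VLAN_PRIO_MASK	0xe000	/* bits 13-15: PCP */
#define VLAN_PRIO_SHIFT	13

static uint16_t vlan_modify_tci(uint16_t tci, uint16_t new_vid,
				uint8_t new_prio)
{
	tci = (tci & ~VLAN_VID_MASK) | (new_vid & VLAN_VID_MASK);
	if (new_prio) {			/* only if a priority was supplied */
		tci &= ~VLAN_PRIO_MASK;
		tci |= (uint16_t)new_prio << VLAN_PRIO_SHIFT;
	}
	return tci;
}

int main(void)
{
	/* old tag: prio 3, DEI 0, vid 100; rewrite to vid 200, prio 5 */
	uint16_t tci = (3 << VLAN_PRIO_SHIFT) | 100;

	tci = vlan_modify_tci(tci, 200, 5);
	assert((tci & VLAN_VID_MASK) == 200);
	assert((tci >> VLAN_PRIO_SHIFT) == 5);
	printf("tci=0x%04x\n", tci);
	return 0;
}
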
@@ -102,6 +136,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
case TCA_VLAN_ACT_POP:
break;
case TCA_VLAN_ACT_PUSH:
+ case TCA_VLAN_ACT_MODIFY:
if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
if (exists)
tcf_hash_release(*a, bind);
@@ -185,7 +220,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
+ if ((v->tcfv_action == TCA_VLAN_ACT_PUSH ||
+ v->tcfv_action == TCA_VLAN_ACT_MODIFY) &&
(nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
v->tcfv_push_proto) ||
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a7c5645..11da7da 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -344,13 +344,15 @@ replay:
if (err == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
- tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ tfilter_notify(net, skb, n, tp, fh,
+ RTM_DELTFILTER);
if (tcf_destroy(tp, false))
RCU_INIT_POINTER(*back, next);
}
goto errout;
case RTM_GETTFILTER:
- err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+ err = tfilter_notify(net, skb, n, tp, fh,
+ RTM_NEWTFILTER);
goto errout;
default:
err = -EINVAL;
@@ -448,7 +450,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct net *net = sock_net(a->skb->sk);
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTFILTER);
}
/* called with RTNL */
@@ -552,7 +555,7 @@ void tcf_exts_destroy(struct tcf_exts *exts)
EXPORT_SYMBOL(tcf_exts_destroy);
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
- struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
+ struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
#ifdef CONFIG_NET_CLS_ACT
{
@@ -560,8 +563,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
- "police", ovr,
- TCA_ACT_BIND);
+ "police", ovr, TCA_ACT_BIND);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -573,8 +575,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
int err, i = 0;
err = tcf_action_init(net, tb[exts->action], rate_tlv,
- NULL, ovr,
- TCA_ACT_BIND, &actions);
+ NULL, ovr, TCA_ACT_BIND,
+ &actions);
if (err)
return err;
list_for_each_entry(act, &actions, list)
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 1d92d4d..bb1d5a4 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
#define CLS_BPF_NAME_LEN 256
+#define CLS_BPF_SUPPORTED_GEN_FLAGS \
+ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
struct cls_bpf_head {
struct list_head plist;
@@ -39,6 +41,8 @@ struct cls_bpf_prog {
struct list_head link;
struct tcf_result res;
bool exts_integrated;
+ bool offloaded;
+ u32 gen_flags;
struct tcf_exts exts;
u32 handle;
union {
@@ -54,8 +58,10 @@ struct cls_bpf_prog {
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
[TCA_BPF_CLASSID] = { .type = NLA_U32 },
[TCA_BPF_FLAGS] = { .type = NLA_U32 },
+ [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 },
[TCA_BPF_FD] = { .type = NLA_U32 },
- [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
+ [TCA_BPF_NAME] = { .type = NLA_NUL_STRING,
+ .len = CLS_BPF_NAME_LEN },
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
[TCA_BPF_OPS] = { .type = NLA_BINARY,
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
@@ -90,7 +96,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
- if (at_ingress) {
+ if (tc_skip_sw(prog->gen_flags)) {
+ filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
+ } else if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
bpf_compute_data_end(skb);
@@ -137,6 +145,91 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
return !prog->bpf_ops;
}
+static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ enum tc_clsbpf_command cmd)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_bpf_offload bpf_offload = {};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSBPF;
+ offload.cls_bpf = &bpf_offload;
+
+ bpf_offload.command = cmd;
+ bpf_offload.exts = &prog->exts;
+ bpf_offload.prog = prog->filter;
+ bpf_offload.name = prog->bpf_name;
+ bpf_offload.exts_integrated = prog->exts_integrated;
+ bpf_offload.gen_flags = prog->gen_flags;
+
+ return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+}
+
+static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ struct cls_bpf_prog *oldprog)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct cls_bpf_prog *obj = prog;
+ enum tc_clsbpf_command cmd;
+ bool skip_sw;
+ int ret;
+
+ skip_sw = tc_skip_sw(prog->gen_flags) ||
+ (oldprog && tc_skip_sw(oldprog->gen_flags));
+
+ if (oldprog && oldprog->offloaded) {
+ if (tc_should_offload(dev, tp, prog->gen_flags)) {
+ cmd = TC_CLSBPF_REPLACE;
+ } else if (!tc_skip_sw(prog->gen_flags)) {
+ obj = oldprog;
+ cmd = TC_CLSBPF_DESTROY;
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ if (!tc_should_offload(dev, tp, prog->gen_flags))
+ return skip_sw ? -EINVAL : 0;
+ cmd = TC_CLSBPF_ADD;
+ }
+
+ ret = cls_bpf_offload_cmd(tp, obj, cmd);
+ if (ret)
+ return skip_sw ? ret : 0;
+
+ obj->offloaded = true;
+ if (oldprog)
+ oldprog->offloaded = false;
+
+ return 0;
+}
+
+static void cls_bpf_stop_offload(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ int err;
+
+ if (!prog->offloaded)
+ return;
+
+ err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
+ if (err) {
+ pr_err("Stopping hardware offload failed: %d\n", err);
+ return;
+ }
+
+ prog->offloaded = false;
+}
+
+static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ if (!prog->offloaded)
+ return;
+
+ cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
+}
+
static int cls_bpf_init(struct tcf_proto *tp)
{
struct cls_bpf_head *head;
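
cls_bpf_offload() reduces four inputs (was there an old program, was it offloaded, can the device take the new one, and the skip_sw flags) to one of ADD, REPLACE, DESTROY, no-op, or error. The branch structure, restated as a stand-alone function with hypothetical names:

#include <stdbool.h>

enum offload_cmd { OFF_NONE, OFF_ADD, OFF_REPLACE, OFF_DESTROY, OFF_ERR };

/* Mirror of the decision in cls_bpf_offload() above. */
static enum offload_cmd pick_cmd(bool old_offloaded, bool dev_can_offload,
				 bool new_skip_sw)
{
	if (old_offloaded) {
		if (dev_can_offload)
			return OFF_REPLACE;	/* swap programs in hardware */
		if (!new_skip_sw)
			return OFF_DESTROY;	/* fall back to software only */
		return OFF_ERR;			/* hw-only filter, no hw path */
	}
	if (!dev_can_offload)
		return new_skip_sw ? OFF_ERR : OFF_NONE;
	return OFF_ADD;
}

int main(void)
{
	/* fresh program on a capable device: plain ADD */
	return pick_cmd(false, true, false) == OFF_ADD ? 0 : 1;
}
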
@@ -176,6 +269,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
{
struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
+ cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
@@ -192,6 +286,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
return false;
list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+ cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
@@ -301,6 +396,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
{
bool is_bpf, is_ebpf, have_exts = false;
struct tcf_exts exts;
+ u32 gen_flags = 0;
int ret;
is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
@@ -325,8 +421,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
}
+ if (tb[TCA_BPF_FLAGS_GEN]) {
+ gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
+ if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
+ !tc_flags_valid(gen_flags)) {
+ ret = -EINVAL;
+ goto errout;
+ }
+ }
prog->exts_integrated = have_exts;
+ prog->gen_flags = gen_flags;
ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
cls_bpf_prog_from_efd(tb, prog, tp);
@@ -409,10 +514,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+ ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE],
+ ovr);
if (ret < 0)
goto errout;
+ ret = cls_bpf_offload(tp, prog, oldprog);
+ if (ret) {
+ cls_bpf_delete_prog(tp, prog);
+ return ret;
+ }
+
if (oldprog) {
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
@@ -474,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
tm->tcm_handle = prog->handle;
+ cls_bpf_offload_update_stats(tp, prog);
+
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -496,6 +610,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
goto nla_put_failure;
+ if (prog->gen_flags &&
+ nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index a379bae..e396723 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
}
-static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return flow->basic.ip_proto;
}
-static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
if (flow->ports.ports)
return ntohs(flow->ports.src);
@@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys
return addr_fold(skb->sk);
}
-static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
if (flow->ports.ports)
return ntohs(flow->ports.dst);
@@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
@@ -161,7 +165,8 @@ fallback:
return flow_get_src(skb, flow);
}
-static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
@@ -173,14 +178,16 @@ fallback:
return flow_get_dst(skb, flow);
}
-static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
return flow_get_proto_src(skb, flow);
}
-static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index a3f4c70..f6f40fb 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -241,7 +241,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = &offload;
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+ &tc);
if (tc_skip_sw(flags))
return err;
@@ -480,7 +481,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
}
fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
- &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC,
sizeof(key->enc_key_id.keyid));
return 0;
@@ -918,7 +919,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
- &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id, TCA_FLOWER_UNSPEC,
sizeof(key->enc_key_id)))
goto nla_put_failure;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index cc0bda9..9dc63d5 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle)
}
static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+ struct tcf_result *res)
{
struct fw_head *head = rcu_dereference_bh(tp->root);
struct fw_filter *f;
@@ -188,7 +188,8 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
static int
fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
- struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
+ struct nlattr **tb, struct nlattr **tca, unsigned long base,
+ bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct tcf_exts e;
@@ -237,9 +238,8 @@ errout:
static int fw_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ u32 handle, struct nlattr **tca, unsigned long *arg,
+ bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *) *arg;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index c91e65d..455fc8f 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp)
return 0;
}
-static void
-route4_delete_filter(struct rcu_head *head)
+static void route4_delete_filter(struct rcu_head *head)
{
struct route4_filter *f = container_of(head, struct route4_filter, rcu);
@@ -474,10 +473,8 @@ errout:
}
static int route4_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
@@ -562,7 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout:
- tcf_exts_destroy(&f->exts);
+ if (f)
+ tcf_exts_destroy(&f->exts);
kfree(f);
return err;
}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index d950070..96144bd 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -50,14 +50,13 @@ struct tcindex_data {
struct rcu_head rcu;
};
-static inline int
-tcindex_filter_is_set(struct tcindex_filter_result *r)
+static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
{
return tcf_exts_is_predicative(&r->exts) || r->res.classid;
}
-static struct tcindex_filter_result *
-tcindex_lookup(struct tcindex_data *p, u16 key)
+static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
+ u16 key)
{
if (p->perfect) {
struct tcindex_filter_result *f = p->perfect + key;
@@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head)
static void tcindex_destroy_fexts(struct rcu_head *head)
{
- struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu);
+ struct tcindex_filter *f = container_of(head, struct tcindex_filter,
+ rcu);
tcf_exts_destroy(&f->result.exts);
kfree(f);
@@ -550,7 +550,7 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force)
static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index a29263a..ae83c3ae 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key,
return h;
}
-static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
+static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct {
struct tc_u_knode *knode;
@@ -256,8 +257,7 @@ deadloop:
return -1;
}
-static struct tc_u_hnode *
-u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
+static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
@@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
return ht;
}
-static struct tc_u_knode *
-u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
+static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
unsigned int sel;
struct tc_u_knode *n = NULL;
@@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp)
return 0;
}
-static int u32_destroy_key(struct tcf_proto *tp,
- struct tc_u_knode *n,
+static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
bool free_pf)
{
tcf_exts_destroy(&n->exts);
@@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
}
}
-static int u32_replace_hw_hnode(struct tcf_proto *tp,
- struct tc_u_hnode *h,
- u32 flags)
+static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
+ u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
@@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
}
}
-static int u32_replace_hw_knode(struct tcf_proto *tp,
- struct tc_u_knode *n,
- u32 flags)
+static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
@@ -763,8 +759,7 @@ errout:
return err;
}
-static void u32_replace_knode(struct tcf_proto *tp,
- struct tc_u_common *tp_c,
+static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
struct tc_u_knode *n)
{
struct tc_u_knode __rcu **ins;
@@ -845,8 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
@@ -1088,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t)
{
struct tc_u_knode *n = (struct tc_u_knode *)fh;
struct tc_u_hnode *ht_up, *ht_down;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d677b34..206dc24 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -389,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
static struct qdisc_rate_table *qdisc_rtab_list;
-struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
+ struct nlattr *tab)
{
struct qdisc_rate_table *rtab;
@@ -541,7 +542,8 @@ nla_put_failure:
return -1;
}
-void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb,
+ const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -888,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock;
Parameters are passed via opt.
*/
-static struct Qdisc *
-qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
- struct Qdisc *p, u32 parent, u32 handle,
- struct nlattr **tca, int *errp)
+static struct Qdisc *qdisc_create(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ struct Qdisc *p, u32 parent, u32 handle,
+ struct nlattr **tca, int *errp)
{
int err;
struct nlattr *kind = tca[TCA_KIND];
@@ -1073,7 +1075,8 @@ struct check_loop_arg {
int depth;
};
-static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+static int check_loop_fn(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w);
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
@@ -1450,7 +1453,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
} else {
if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
}
@@ -1471,7 +1475,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
}
if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
}
@@ -1505,7 +1510,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
s_q_idx = 0;
q_idx = 0;
- if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, true) < 0)
+ if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
+ true) < 0)
goto done;
dev_queue = dev_ingress_queue(dev);
@@ -1640,7 +1646,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
if (cops->delete)
err = cops->delete(q, cl);
if (err == 0)
- tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
+ tclass_notify(net, skb, n, q, cl,
+ RTM_DELTCLASS);
goto out;
case RTM_GETTCLASS:
err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
@@ -1738,12 +1745,14 @@ struct qdisc_dump_args {
struct netlink_callback *cb;
};
-static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *arg)
{
struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTCLASS);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
@@ -1976,10 +1985,12 @@ static int __init pktsched_init(void)
rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
+ NULL);
rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
+ NULL);
return 0;
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 4002df3..5bfa79e 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -69,7 +69,7 @@ struct codel_sched_data {
static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
{
struct Qdisc *sch = ctx;
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
if (skb)
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index baeed6a..1e37247 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_drop(skb, sch, to_free);
@@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
unsigned int prev_backlog;
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
prev_backlog = sch->qstats.backlog;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index dc52cc1..18e7524 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
@@ -94,6 +95,7 @@ struct fq_sched_data {
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
u32 orphan_mask; /* mask for orphaned skb */
+ u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
@@ -407,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
+ unsigned long sample;
struct rb_node *p;
if (q->time_next_delayed_flow > now)
return;
+ /* Update unthrottle latency EWMA.
+ * This is cheap and can help diagnose timer/latency problems.
+ */
+ sample = (unsigned long)(now - q->time_next_delayed_flow);
+ q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+ q->unthrottle_latency_ns += sample >> 3;
+
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
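The two shift-by-3 updates above implement an exponentially weighted moving average with weight 1/8: avg = avg - avg/8 + sample/8, so each unthrottle-latency sample contributes an eighth of its value. A standalone C sketch of the same update (names are illustrative, not kernel code):

#include <stdio.h>

static unsigned long ewma_ns;	/* running estimate, starts at 0 */

static void ewma_update(unsigned long sample_ns)
{
	ewma_ns -= ewma_ns >> 3;	/* drop 1/8 of the old average */
	ewma_ns += sample_ns >> 3;	/* blend in 1/8 of the sample  */
}

int main(void)
{
	unsigned long samples[] = { 8000, 8000, 64000, 8000 };

	for (unsigned int i = 0; i < 4; i++) {
		ewma_update(samples[i]);
		printf("sample=%lu avg=%lu\n", samples[i], ewma_ns);
	}
	return 0;
}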
@@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
- u32 rate;
+ u32 rate, plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
@@ -482,7 +492,7 @@ begin:
prefetch(&skb->end);
f->credit -= qdisc_pkt_len(skb);
- if (f->credit > 0 || !q->rate_enable)
+ if (!q->rate_enable)
goto out;
/* Do not pace locally generated ack packets */
@@ -493,8 +503,15 @@ begin:
if (skb->sk)
rate = min(skb->sk->sk_pacing_rate, rate);
+ if (rate <= q->low_rate_threshold) {
+ f->credit = 0;
+ plen = qdisc_pkt_len(skb);
+ } else {
+ plen = max(qdisc_pkt_len(skb), q->quantum);
+ if (f->credit > 0)
+ goto out;
+ }
if (rate != ~0U) {
- u32 plen = max(qdisc_pkt_len(skb), q->quantum);
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
@@ -507,7 +524,12 @@ begin:
len = NSEC_PER_SEC;
q->stat_pkts_too_long++;
}
-
+ /* Account for scheduler/timer drift.
+ * f->time_next_packet was set when the prior packet was sent,
+ * and the current time (@now) can be late by tens of microseconds.
+ */
+ if (f->time_next_packet)
+ len -= min(len/2, now - f->time_next_packet);
f->time_next_packet = now + len;
}
out:
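Taken together, the hunk above computes the pacing gap as len = plen * NSEC_PER_SEC / rate, clamps it to one second (counting the packet as too long), then credits back up to half the gap when the qdisc was serviced later than f->time_next_packet. A userspace sketch of that arithmetic (the guard on now > prev is made explicit here, and the names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t next_send_ns(uint32_t plen, uint32_t rate_Bps,
			     uint64_t now, uint64_t prev)
{
	uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

	len = rate_Bps ? len / rate_Bps : NSEC_PER_SEC;
	if (len > NSEC_PER_SEC)		/* packet too long for this rate */
		len = NSEC_PER_SEC;
	if (prev && now > prev)		/* timer fired late: credit back */
		len -= (now - prev < len / 2) ? now - prev : len / 2;
	return now + len;
}

int main(void)
{
	/* 1500 bytes at 125000 B/s (1 Mbit/s): 12 ms between packets */
	printf("%llu\n",
	       (unsigned long long)next_send_ns(1500, 125000, 1000000, 0));
	return 0;
}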
@@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+ q->low_rate_threshold =
+ nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
@@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
q->flow_max_rate = ~0U;
+ q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
q->old_flows.first = NULL;
@@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
+ q->low_rate_threshold = 550000 / 8;
qdisc_watchdog_init(&q->watchdog, sch);
if (opt)
@@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+ q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -838,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.flows = q->flows;
st.inactive_flows = q->inactive_flows;
st.throttled_flows = q->throttled_flows;
- st.pad = 0;
-
+ st.unthrottle_latency_ns = min_t(unsigned long,
+ q->unthrottle_latency_ns, ~0U);
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0d21b56..6cfb6e9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
*/
struct pfifo_fast_priv {
u32 bitmap;
- struct sk_buff_head q[PFIFO_FAST_BANDS];
+ struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};
/*
@@ -477,7 +477,7 @@ struct pfifo_fast_priv {
*/
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
+static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
int band)
{
return priv->q + band;
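The bitmap keeps one bit per non-empty band, and bitmap2band maps a 3-bit occupancy mask to the lowest-numbered (highest-priority) non-empty band, or -1 when all bands are empty; entry i is simply the index of the lowest set bit of i. A quick standalone check of that rule (__builtin_ctz is a GCC/Clang builtin):

#include <assert.h>

static const int bitmap2band[] = { -1, 0, 1, 0, 2, 0, 1, 0 };

int main(void)
{
	for (int m = 1; m < 8; m++)
		assert(bitmap2band[m] == __builtin_ctz(m));
	return 0;
}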
@@ -486,10 +486,10 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
+ if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *list = band2list(priv, band);
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
@@ -505,11 +505,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (likely(band >= 0)) {
- struct sk_buff_head *list = band2list(priv, band);
- struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+ struct qdisc_skb_head *qh = band2list(priv, band);
+ struct sk_buff *skb = __qdisc_dequeue_head(qh);
+
+ if (likely(skb != NULL)) {
+ qdisc_qstats_backlog_dec(qdisc, skb);
+ qdisc_bstats_update(qdisc, skb);
+ }
qdisc->q.qlen--;
- if (skb_queue_empty(list))
+ if (qh->qlen == 0)
priv->bitmap &= ~(1 << band);
return skb;
@@ -524,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (band >= 0) {
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *qh = band2list(priv, band);
- return skb_peek(list);
+ return qh->head;
}
return NULL;
@@ -564,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __skb_queue_head_init(band2list(priv, prio));
+ qdisc_skb_head_init(band2list(priv, prio));
/* Can by-pass the queue discipline */
qdisc->flags |= TCQ_F_CAN_BYPASS;
@@ -612,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;
}
- skb_queue_head_init(&sch->q);
+ qdisc_skb_head_init(&sch->q);
+ spin_lock_init(&sch->q.lock);
spin_lock_init(&sch->busylock);
lockdep_set_class(&sch->busylock,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 53dbfa1..c798d0d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -162,7 +162,7 @@ struct htb_sched {
struct work_struct work;
/* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
+ struct qdisc_skb_head direct_queue;
long direct_pkts;
struct qdisc_watchdog watchdog;
@@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
list_del_init(&cl->un.leaf.drop_list);
}
+static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
+ struct qdisc_skb_head *qh)
+{
+ struct sk_buff *last = qh->tail;
+
+ if (last) {
+ skb->next = NULL;
+ last->next = skb;
+ qh->tail = skb;
+ } else {
+ qh->tail = skb;
+ qh->head = skb;
+ }
+ qh->qlen++;
+}
+
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen) {
- __skb_queue_tail(&q->direct_queue, skb);
+ htb_enqueue_tail(skb, sch, &q->direct_queue);
q->direct_pkts++;
} else {
return qdisc_drop(skb, sch, to_free);
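The htb_enqueue_tail() helper added above is plain singly-linked tail insertion on the new qdisc_skb_head layout (head, tail, qlen). A self-contained model of the same operation (the struct names are illustrative stand-ins, not the kernel types):

#include <stddef.h>

struct pkt { struct pkt *next; };
struct pkt_head { struct pkt *head, *tail; unsigned int qlen; };

static void enqueue_tail(struct pkt_head *qh, struct pkt *p)
{
	p->next = NULL;
	if (qh->tail)
		qh->tail->next = p;	/* append after the current tail */
	else
		qh->head = p;		/* list was empty */
	qh->tail = p;
	qh->qlen++;
}

int main(void)
{
	struct pkt_head q = { 0 };
	struct pkt a = { 0 }, b = { 0 };

	enqueue_tail(&q, &a);
	enqueue_tail(&q, &b);
	return !(q.head == &a && q.tail == &b && q.qlen == 2);
}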
@@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
unsigned long start_at;
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
- skb = __skb_dequeue(&q->direct_queue);
+ skb = __qdisc_dequeue_head(&q->direct_queue);
if (skb != NULL) {
ok:
qdisc_bstats_update(sch, skb);
@@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
- __skb_queue_head_init(&q->direct_queue);
+ qdisc_skb_head_init(&q->direct_queue);
if (tb[TCA_HTB_DIRECT_QLEN])
q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index aaaf021..9f7b380 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
return segs;
}
+static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
+{
+ skb->next = qh->head;
+
+ if (!qh->head)
+ qh->tail = skb;
+ qh->head = skb;
+ qh->qlen++;
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
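netem_enqueue_skb_head() above is the mirror image of the HTB tail helper: push at the head and, when the list was empty, make the new element the tail as well. The same illustrative model as before, extended with the head variant (again not the kernel types):

struct pkt { struct pkt *next; };
struct pkt_head { struct pkt *head, *tail; unsigned int qlen; };

static void enqueue_head(struct pkt_head *qh, struct pkt *p)
{
	p->next = qh->head;
	if (!qh->head)
		qh->tail = p;	/* first element is both head and tail */
	qh->head = p;
	qh->qlen++;
}

int main(void)
{
	struct pkt_head q = { 0 };
	struct pkt a = { 0 }, b = { 0 };

	enqueue_head(&q, &a);	/* a is head and tail */
	enqueue_head(&q, &b);	/* b becomes the new head */
	return !(q.head == &b && q.tail == &a && q.qlen == 2);
}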
@@ -502,7 +512,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1<<(prandom_u32() % 8);
}
- if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
@@ -522,8 +532,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (q->rate) {
struct sk_buff *last;
- if (!skb_queue_empty(&sch->q))
- last = skb_peek_tail(&sch->q);
+ if (sch->q.qlen)
+ last = sch->q.tail;
else
last = netem_rb_to_skb(rb_last(&q->t_root));
if (last) {
@@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cb->time_to_send = psched_get_time();
q->counter = 0;
- __skb_queue_head(&sch->q, skb);
+ netem_enqueue_skb_head(&sch->q, skb);
sch->qstats.requeues++;
}
@@ -587,7 +597,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct rb_node *p;
tfifo_dequeue:
- skb = __skb_dequeue(&sch->q);
+ skb = __qdisc_dequeue_head(&sch->q);
if (skb) {
qdisc_qstats_backlog_dec(sch, skb);
deliver:
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index a570b0b..5c3a99d 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
@@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
- skb = __qdisc_dequeue_head(sch, &sch->q);
+ skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index f27ffee..ca0516e 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1153,6 +1153,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
if (!skb)
return NULL;
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
qdisc_bstats_update(sch, skb);
@@ -1256,6 +1257,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
bstats_update(&cl->bstats, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
++sch->q.qlen;
agg = cl->agg;
@@ -1476,6 +1478,7 @@ static void qfq_reset_qdisc(struct Qdisc *sch)
qdisc_reset(cl->qdisc);
}
}
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index add3cc7..20a350b 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -400,6 +400,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
enqueue:
ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
increment_qlen(skb, q);
} else if (net_xmit_drop_count(ret)) {
@@ -428,6 +429,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
decrement_qlen(skb, q);
}
@@ -450,6 +452,7 @@ static void sfb_reset(struct Qdisc *sch)
struct sfb_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
q->slot = 0;
q->double_buffering = false;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1c23060..f10d339 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
transports) {
if (t->pmtu_pending && t->dst) {
sctp_transport_update_pmtu(sk, t,
- WORD_TRUNC(dst_mtu(t->dst)));
+ SCTP_TRUNC4(dst_mtu(t->dst)));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
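This and the following SCTP hunks rename the alignment helpers: WORD_ROUND becomes SCTP_PAD4 and WORD_TRUNC becomes SCTP_TRUNC4. Assuming the new macros keep the old behavior, they round a length up, respectively truncate it down, to a 4-byte boundary:

/* Assumed definitions, matching the behavior of the old macros. */
#define SCTP_PAD4(s)   (((s) + 3) & ~3)	/* 5 -> 8, 8 -> 8 */
#define SCTP_TRUNC4(s) ((s) & ~3)	/* 5 -> 4, 8 -> 8 */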
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index af9cc80..7a1cdf4 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -192,12 +192,18 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
msg, msg->expires_at, jiffies);
}
+ if (asoc->peer.prsctp_capable &&
+ SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
+ msg->expires_at =
+ jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
+
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
- max_data = (asoc->pathmtu -
- sctp_sk(asoc->base.sk)->pf->af->net_header_len -
- sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3;
+ max_data = asoc->pathmtu -
+ sctp_sk(asoc->base.sk)->pf->af->net_header_len -
+ sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+ max_data = SCTP_TRUNC4(max_data);
max = asoc->frag_point;
/* If the peer requested that we authenticate DATA chunks
@@ -208,8 +214,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
if (hmac_desc)
- max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) +
- hmac_desc->hmac_len);
+ max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) +
+ hmac_desc->hmac_len);
}
/* Now, check if we need to reduce our max */
@@ -229,7 +235,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
msg_len > max)
- max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t));
+ max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
@@ -348,7 +354,7 @@ errout:
/* Check whether this message has expired. */
int sctp_chunk_abandoned(struct sctp_chunk *chunk)
{
- if (!chunk->asoc->prsctp_enable ||
+ if (!chunk->asoc->peer.prsctp_capable ||
!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
struct sctp_datamsg *msg = chunk->msg;
@@ -362,14 +368,14 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
}
if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
- time_after(jiffies, chunk->prsctp_param)) {
+ time_after(jiffies, chunk->msg->expires_at)) {
if (chunk->sent_count)
chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
else
chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
return 1;
} else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
- chunk->sent_count > chunk->prsctp_param) {
+ chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
return 1;
}
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 69444d3..a2ea1d1 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
/* PMTU discovery (RFC1191) */
if (ICMP_FRAG_NEEDED == code) {
sctp_icmp_frag_needed(sk, asoc, transport,
- WORD_TRUNC(info));
+ SCTP_TRUNC4(info));
goto out_unlock;
} else {
if (ICMP_PROT_UNREACH == code) {
@@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = offset + WORD_ROUND(ntohs(ch->length));
+ ch_end = offset + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb->len)
break;
@@ -796,27 +796,34 @@ struct sctp_hash_cmp_arg {
static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
+ struct sctp_transport *t = (struct sctp_transport *)ptr;
const struct sctp_hash_cmp_arg *x = arg->key;
- const struct sctp_transport *t = ptr;
- struct sctp_association *asoc = t->asoc;
- const struct net *net = x->net;
+ struct sctp_association *asoc;
+ int err = 1;
if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
- return 1;
- if (!net_eq(sock_net(asoc->base.sk), net))
- return 1;
+ return err;
+ if (!sctp_transport_hold(t))
+ return err;
+
+ asoc = t->asoc;
+ if (!net_eq(sock_net(asoc->base.sk), x->net))
+ goto out;
if (x->ep) {
if (x->ep != asoc->ep)
- return 1;
+ goto out;
} else {
if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
- return 1;
+ goto out;
if (!sctp_bind_addr_match(&asoc->base.bind_addr,
x->laddr, sctp_sk(asoc->base.sk)))
- return 1;
+ goto out;
}
- return 0;
+ err = 0;
+out:
+ sctp_transport_put(t);
+ return err;
}
static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
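The rewritten compare callback above pins the transport with sctp_transport_hold() before dereferencing t->asoc, since rhashtable lookups run locklessly and the association could otherwise be freed mid-compare; every early exit now funnels through the out label so the reference is always dropped. The hold-only-if-live pattern, modeled in C11 atomics (illustrative types; the kernel uses its own refcount helpers):

#include <stdatomic.h>
#include <stdbool.h>

struct obj { atomic_int refcnt; };

static bool obj_hold(struct obj *o)
{
	int v = atomic_load(&o->refcnt);

	while (v > 0)	/* succeed only while the object is still live */
		if (atomic_compare_exchange_weak(&o->refcnt, &v, v + 1))
			return true;
	return false;	/* refcnt hit zero: object is being freed */
}

static void obj_put(struct obj *o)
{
	atomic_fetch_sub(&o->refcnt, 1);	/* real code frees at zero */
}

int main(void)
{
	struct obj o = { 1 };

	if (!obj_hold(&o))	/* refcnt 1 -> 2 */
		return 1;
	obj_put(&o);
	obj_put(&o);		/* back to zero */
	return obj_hold(&o);	/* must fail: object is dead */
}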
@@ -1121,7 +1128,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb_tail_pointer(skb))
break;
@@ -1190,7 +1197,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
* that the chunk length doesn't cause overflow. Otherwise, we'll
* walk off the end.
*/
- if (WORD_ROUND(ntohs(ch->length)) > skb->len)
+ if (SCTP_PAD4(ntohs(ch->length)) > skb->len)
return NULL;
/* If this is INIT/INIT-ACK look inside the chunk too. */
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 6437aa9..f731de3e8 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -213,7 +213,7 @@ new_skb:
}
chunk->chunk_hdr = ch;
- chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
chunk->subh.v = NULL; /* Subheader is no longer valid. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 0c605ec..2a5c189 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -297,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
struct sctp_chunk *chunk)
{
sctp_xmit_t retval = SCTP_XMIT_OK;
- __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length));
+ __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length));
/* Check to see if this chunk will fit into the packet */
retval = sctp_packet_will_fit(packet, chunk, chunk_len);
@@ -508,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
if (gso) {
pkt_size = packet->overhead;
list_for_each_entry(chunk, &packet->chunk_list, list) {
- int padded = WORD_ROUND(chunk->skb->len);
+ int padded = SCTP_PAD4(chunk->skb->len);
if (pkt_size + padded > tp->pathmtu)
break;
@@ -538,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
* included in the chunk length field. The sender should
* never pad with more than 3 bytes.
*
- * [This whole comment explains WORD_ROUND() below.]
+ * [This whole comment explains SCTP_PAD4() below.]
*/
pkt_size -= packet->overhead;
@@ -560,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
has_data = 1;
}
- padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+ padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
if (padding)
memset(skb_put(chunk->skb, padding), 0, padding);
@@ -587,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
* acknowledged or have failed.
* Re-queue auth chunks if needed.
*/
- pkt_size -= WORD_ROUND(chunk->skb->len);
+ pkt_size -= SCTP_PAD4(chunk->skb->len);
if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
sctp_chunk_free(chunk);
@@ -911,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
*/
maxsize = pmtu - packet->overhead;
if (packet->auth)
- maxsize -= WORD_ROUND(packet->auth->skb->len);
+ maxsize -= SCTP_PAD4(packet->auth->skb->len);
if (chunk_len > maxsize)
retval = SCTP_XMIT_PMTU_FULL;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 8c3f446..5825853 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -304,7 +304,7 @@ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
"illegal chunk");
sctp_outq_tail_data(q, chunk);
- if (chunk->asoc->prsctp_enable &&
+ if (chunk->asoc->peer.prsctp_capable &&
SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
chunk->asoc->sent_cnt_removable++;
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
@@ -354,7 +354,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
- chk->prsctp_param <= sinfo->sinfo_timetolive)
+ chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
continue;
list_del_init(&chk->transmitted_list);
@@ -389,7 +389,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
list_for_each_entry_safe(chk, temp, queue, list) {
if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
- chk->prsctp_param <= sinfo->sinfo_timetolive)
+ chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
continue;
list_del_init(&chk->list);
@@ -413,7 +413,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc,
{
struct sctp_transport *transport;
- if (!asoc->prsctp_enable || !asoc->sent_cnt_removable)
+ if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable)
return;
msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
@@ -1026,7 +1026,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
/* Mark as failed send. */
sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
- if (asoc->prsctp_enable &&
+ if (asoc->peer.prsctp_capable &&
SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
asoc->sent_cnt_removable--;
sctp_chunk_free(chunk);
@@ -1319,7 +1319,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
tsn = ntohl(tchunk->subh.data_hdr->tsn);
if (TSN_lte(tsn, ctsn)) {
list_del_init(&tchunk->transmitted_list);
- if (asoc->prsctp_enable &&
+ if (asoc->peer.prsctp_capable &&
SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
asoc->sent_cnt_removable--;
sctp_chunk_free(tchunk);
@@ -1719,7 +1719,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
{
int i;
sctp_sack_variable_t *frags;
- __u16 gap;
+ __u16 tsn_offset, blocks;
__u32 ctsn = ntohl(sack->cum_tsn_ack);
if (TSN_lte(tsn, ctsn))
@@ -1738,10 +1738,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
*/
frags = sack->variable;
- gap = tsn - ctsn;
- for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) {
- if (TSN_lte(ntohs(frags[i].gab.start), gap) &&
- TSN_lte(gap, ntohs(frags[i].gab.end)))
+ blocks = ntohs(sack->num_gap_ack_blocks);
+ tsn_offset = tsn - ctsn;
+ for (i = 0; i < blocks; ++i) {
+ if (tsn_offset >= ntohs(frags[i].gab.start) &&
+ tsn_offset <= ntohs(frags[i].gab.end))
goto pass;
}
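The rewritten check treats tsn - ctsn as a plain unsigned offset and compares it directly against each gap-ack block's start/end, instead of applying the serial-number macro TSN_lte to 16-bit offsets. Worked example: with ctsn = 100, a block {start = 2, end = 4} covers TSNs 102 through 104, so tsn = 103 (offset 3) is acked and tsn = 105 is not. A userspace model of the fixed logic:

#include <stdbool.h>
#include <stdint.h>

struct gab { uint16_t start, end; };	/* offsets relative to ctsn */

static bool tsn_acked(uint32_t ctsn, uint32_t tsn,
		      const struct gab *blk, int n)
{
	uint16_t off = tsn - ctsn;	/* plain unsigned offset */

	if ((int32_t)(tsn - ctsn) <= 0)	/* TSN_lte(tsn, ctsn) */
		return true;
	for (int i = 0; i < n; i++)
		if (off >= blk[i].start && off <= blk[i].end)
			return true;
	return false;
}

int main(void)
{
	struct gab b = { .start = 2, .end = 4 };	/* covers 102..104 */

	return !(tsn_acked(100, 103, &b, 1) && !tsn_acked(100, 105, &b, 1));
}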
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ef8ba77..206377f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -73,13 +73,17 @@ static const struct snmp_mib sctp_snmp_list[] = {
/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */
static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
{
+ unsigned long buff[SCTP_MIB_MAX];
struct net *net = seq->private;
int i;
- for (i = 0; sctp_snmp_list[i].name != NULL; i++)
+ memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX);
+
+ snmp_get_cpu_field_batch(buff, sctp_snmp_list,
+ net->sctp.sctp_statistics);
+ for (i = 0; sctp_snmp_list[i].name; i++)
seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
- snmp_fold_field(net->sctp.sctp_statistics,
- sctp_snmp_list[i].entry));
+ buff[i]);
return 0;
}
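Instead of folding one MIB field across all CPUs per loop iteration (one pass over every CPU for each counter), snmp_get_cpu_field_batch() walks the per-cpu data once and accumulates all counters into the on-stack buffer, which the loop then just prints. The folding pattern, modeled standalone (the fixed sizes here are illustrative):

#define NR_CPUS 4
#define MIB_MAX 3

static unsigned long percpu_mib[NR_CPUS][MIB_MAX];

static void fold_batch(unsigned long buff[MIB_MAX])
{
	/* one pass over the CPUs folds every counter at once */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		for (int i = 0; i < MIB_MAX; i++)
			buff[i] += percpu_mib[cpu][i];
}

int main(void)
{
	unsigned long buff[MIB_MAX] = { 0 };

	percpu_mib[0][1] = 5;
	percpu_mib[3][1] = 7;
	fold_batch(buff);
	return buff[1] != 12;
}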
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 807158e3..048954e 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -276,28 +276,17 @@ out:
return err;
}
-static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
+static int sctp_sock_dump(struct sock *sk, void *p)
{
- struct sctp_endpoint *ep = tsp->asoc->ep;
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct sctp_comm_param *commp = p;
- struct sock *sk = ep->base.sk;
struct sk_buff *skb = commp->skb;
struct netlink_callback *cb = commp->cb;
const struct inet_diag_req_v2 *r = commp->r;
- struct sctp_association *assoc =
- list_entry(ep->asocs.next, struct sctp_association, asocs);
+ struct sctp_association *assoc;
int err = 0;
- /* find the ep only once through the transports by this condition */
- if (tsp->asoc != assoc)
- goto out;
-
- if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
- goto out;
-
lock_sock(sk);
- if (sk != assoc->base.sk)
- goto release;
list_for_each_entry(assoc, &ep->asocs, asocs) {
if (cb->args[4] < cb->args[1])
goto next;
@@ -317,7 +306,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
NLM_F_MULTI, cb->nlh,
commp->net_admin) < 0) {
cb->args[3] = 1;
- err = 2;
+ err = 1;
goto release;
}
cb->args[3] = 1;
@@ -327,7 +316,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0, cb->nlh,
commp->net_admin) < 0) {
- err = 2;
+ err = 1;
goto release;
}
next:
@@ -339,10 +328,35 @@ next:
cb->args[4] = 0;
release:
release_sock(sk);
+ sock_put(sk);
return err;
+}
+
+static int sctp_get_sock(struct sctp_transport *tsp, void *p)
+{
+ struct sctp_endpoint *ep = tsp->asoc->ep;
+ struct sctp_comm_param *commp = p;
+ struct sock *sk = ep->base.sk;
+ struct netlink_callback *cb = commp->cb;
+ const struct inet_diag_req_v2 *r = commp->r;
+ struct sctp_association *assoc =
+ list_entry(ep->asocs.next, struct sctp_association, asocs);
+
+ /* find the ep only once while walking the transports, using this condition */
+ if (tsp->asoc != assoc)
+ goto out;
+
+ if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
+ goto out;
+
+ sock_hold(sk);
+ cb->args[5] = (long)sk;
+
+ return 1;
+
out:
cb->args[2]++;
- return err;
+ return 0;
}
static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
@@ -480,10 +494,18 @@ skip:
* 2 : to record the transport pos of this time's traversal
* 3 : to mark if we have dumped the ep info of the current asoc
* 4 : to work as a temporary variable for traversing the list
+ * 5 : to save the sk we get from traversing the tsp list.
*/
if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
goto done;
- sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp);
+
+next:
+ cb->args[5] = 0;
+ sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp);
+
+ if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp))
+ goto next;
+
done:
cb->args[1] = cb->args[4];
cb->args[4] = 0;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 8c77b87..9e9690b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_types = sp->pf->supported_addrs(sp, types);
chunksize = sizeof(init) + addrs_len;
- chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
+ chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
chunksize += sizeof(ecap_param);
if (asoc->prsctp_enable)
@@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
/* Add HMACS parameter length if any were defined */
auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
if (auth_hmacs->length)
- chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
else
auth_hmacs = NULL;
/* Add CHUNKS parameter length */
auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
if (auth_chunks->length)
- chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
else
auth_chunks = NULL;
@@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
/* If we have any extensions to report, account for that */
if (num_ext)
- chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
- num_ext);
+ chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
/* RFC 2960 3.3.2 Initiation (INIT) (1)
*
@@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
if (auth_hmacs->length)
- chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
else
auth_hmacs = NULL;
auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
if (auth_chunks->length)
- chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
else
auth_chunks = NULL;
@@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
}
if (num_ext)
- chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
- num_ext);
+ chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
/* Now allocate and fill out the chunk. */
retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
@@ -706,20 +706,6 @@ nodata:
return retval;
}
-static void sctp_set_prsctp_policy(struct sctp_chunk *chunk,
- const struct sctp_sndrcvinfo *sinfo)
-{
- if (!chunk->asoc->prsctp_enable)
- return;
-
- if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
- chunk->prsctp_param =
- jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
- else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) ||
- SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags))
- chunk->prsctp_param = sinfo->sinfo_timetolive;
-}
-
/* Make a DATA chunk for the given association from the provided
* parameters. However, do not populate the data payload.
*/
@@ -753,7 +739,6 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
- sctp_set_prsctp_policy(retval, sinfo);
nodata:
return retval;
@@ -1390,7 +1375,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
struct sock *sk;
/* No need to allocate LL here, as this is only a chunk. */
- skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
+ skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp);
if (!skb)
goto nodata;
@@ -1482,7 +1467,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
void *target;
void *padding;
int chunklen = ntohs(chunk->chunk_hdr->length);
- int padlen = WORD_ROUND(chunklen) - chunklen;
+ int padlen = SCTP_PAD4(chunklen) - chunklen;
padding = skb_put(chunk->skb, padlen);
target = skb_put(chunk->skb, len);
@@ -1900,7 +1885,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc,
struct __sctp_missing report;
__u16 len;
- len = WORD_ROUND(sizeof(report));
+ len = SCTP_PAD4(sizeof(report));
/* Make an ERROR chunk, preparing enough room for
* returning multiple unknown parameters.
@@ -2098,9 +2083,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
if (*errp) {
if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- WORD_ROUND(ntohs(param.p->length))))
+ SCTP_PAD4(ntohs(param.p->length))))
sctp_addto_chunk_fixed(*errp,
- WORD_ROUND(ntohs(param.p->length)),
+ SCTP_PAD4(ntohs(param.p->length)),
param.v);
} else {
/* If there is no memory for generating the ERROR
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index d88bb2b..026e3bc 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net,
}
/* Report violation if chunk len overflows */
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb_tail_pointer(skb))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
@@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
hdr = unk_chunk->chunk_hdr;
err_chunk = sctp_make_op_error(asoc, unk_chunk,
SCTP_ERROR_UNKNOWN_CHUNK, hdr,
- WORD_ROUND(ntohs(hdr->length)),
+ SCTP_PAD4(ntohs(hdr->length)),
0);
if (err_chunk) {
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
@@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
hdr = unk_chunk->chunk_hdr;
err_chunk = sctp_make_op_error(asoc, unk_chunk,
SCTP_ERROR_UNKNOWN_CHUNK, hdr,
- WORD_ROUND(ntohs(hdr->length)),
+ SCTP_PAD4(ntohs(hdr->length)),
0);
if (err_chunk) {
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6cdc61c..fb02c70 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4473,17 +4473,21 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
const union sctp_addr *paddr, void *p)
{
struct sctp_transport *transport;
- int err = 0;
+ int err = -ENOENT;
rcu_read_lock();
transport = sctp_addrs_lookup_transport(net, laddr, paddr);
if (!transport || !sctp_transport_hold(transport))
goto out;
- err = cb(transport, p);
+
+ sctp_association_hold(transport->asoc);
sctp_transport_put(transport);
-out:
rcu_read_unlock();
+ err = cb(transport, p);
+ sctp_association_put(transport->asoc);
+
+out:
return err;
}
EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 81b8667..ce54dce 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
}
if (transport->dst) {
- transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
+ transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
} else
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
@@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport,
return;
}
if (transport->dst) {
- transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
+ transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
/* Initialize sk->sk_rcv_saddr, if the transport is the
* association's active path for getsockname().
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d85b803..bea0005 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
ch = (sctp_errhdr_t *)(chunk->skb->data);
cause = ch->cause;
- elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
+ elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
/* Pull off the ERROR header. */
skb_pull(chunk->skb, sizeof(sctp_errhdr_t));
@@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
* MUST ignore the padding bytes.
*/
len = ntohs(chunk->chunk_hdr->length);
- padding = WORD_ROUND(len) - len;
+ padding = SCTP_PAD4(len) - len;
/* Fixup cloned skb with just this chunks data. */
skb_trim(skb, chunk->chunk_end - padding - skb->data);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 877e550..84d0fda 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
* we can go ahead and clear out the lobby in one shot
*/
if (!skb_queue_empty(&sp->pd_lobby)) {
- struct list_head *list;
skb_queue_splice_tail_init(&sp->pd_lobby,
&sk->sk_receive_queue);
- list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
- INIT_LIST_HEAD(list);
return 1;
}
} else {
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1d28181..d858202 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -569,9 +569,10 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
struct rsc *found;
memset(&rsci, 0, sizeof(rsci));
- rsci.handle.data = handle->data;
- rsci.handle.len = handle->len;
+ if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
+ return NULL;
found = rsc_lookup(cd, &rsci);
+ rsc_free(&rsci);
if (!found)
return NULL;
if (cache_check(cd, &found->h, NULL))
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 536d0be..799cce6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
#include <linux/module.h> /* try_module_get()/module_put() */
@@ -923,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
}
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i < buf->rb_max_requests; i++) {
+ for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) {
struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
@@ -1018,6 +1019,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
rep = rpcrdma_buffer_get_rep_locked(buf);
rpcrdma_destroy_rep(ia, rep);
}
+ buf->rb_send_count = 0;
spin_lock(&buf->rb_reqslock);
while (!list_empty(&buf->rb_allreqs)) {
@@ -1032,6 +1034,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
+ buf->rb_recv_count = 0;
rpcrdma_destroy_mrs(buf);
}
@@ -1074,8 +1077,27 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
spin_unlock(&buf->rb_mwlock);
}
+static struct rpcrdma_rep *
+rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
+{
+ /* If an RPC previously completed without a reply (say, due to
+ * a credential problem or a soft timeout) then hold off
+ * on supplying more Receive buffers until the number of new
+ * pending RPCs catches up to the number of posted Receives.
+ */
+ if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
+ return NULL;
+
+ if (unlikely(list_empty(&buffers->rb_recv_bufs)))
+ return NULL;
+ buffers->rb_recv_count++;
+ return rpcrdma_buffer_get_rep_locked(buffers);
+}
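The gate above is a simple credit scheme: rb_send_count counts RPCs currently holding a send buffer, rb_recv_count counts receive buffers handed out, and a new receive buffer is supplied only once pending sends have caught up with posted receives. Modeled standalone (field names mirror the patch; the logic is a sketch, not the kernel locking):

struct bufs { int send_count, recv_count, reps_free; };

static int get_rep(struct bufs *b)
{
	if (b->send_count < b->recv_count)
		return 0;	/* hold off: earlier replies still pending */
	if (b->reps_free == 0)
		return 0;	/* no receive buffers left */
	b->reps_free--;
	b->recv_count++;
	return 1;
}

int main(void)
{
	struct bufs b = { .send_count = 1, .recv_count = 2, .reps_free = 8 };

	return get_rep(&b);	/* 0: a posted receive is still unanswered */
}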
+
/*
* Get a set of request/reply buffers.
+ *
+ * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -1085,21 +1107,15 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
spin_lock(&buffers->rb_lock);
if (list_empty(&buffers->rb_send_bufs))
goto out_reqbuf;
+ buffers->rb_send_count++;
req = rpcrdma_buffer_get_req_locked(buffers);
- if (list_empty(&buffers->rb_recv_bufs))
- goto out_repbuf;
- req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
return req;
out_reqbuf:
spin_unlock(&buffers->rb_lock);
- pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
- return NULL;
-out_repbuf:
- list_add(&req->rl_free, &buffers->rb_send_bufs);
- spin_unlock(&buffers->rb_lock);
- pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
+ pr_warn("RPC: %s: out of request buffers\n", __func__);
return NULL;
}
@@ -1117,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
+ buffers->rb_send_count--;
list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
- if (rep)
+ if (rep) {
+ buffers->rb_recv_count--;
list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ }
spin_unlock(&buffers->rb_lock);
}
@@ -1133,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
struct rpcrdma_buffer *buffers = req->rl_buffer;
spin_lock(&buffers->rb_lock);
- if (!list_empty(&buffers->rb_recv_bufs))
- req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
}
@@ -1148,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
spin_lock(&buffers->rb_lock);
+ buffers->rb_recv_count--;
list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
spin_unlock(&buffers->rb_lock);
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 670fad5..a71b0f5 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -321,6 +321,7 @@ struct rpcrdma_buffer {
char *rb_pool;
spinlock_t rb_lock; /* protect buf lists */
+ int rb_send_count, rb_recv_count;
struct list_head rb_send_bufs;
struct list_head rb_recv_bufs;
u32 rb_max_requests;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8ede3bc..bf16883 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1074,7 +1074,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
skb = skb_recv_datagram(sk, 0, 1, &err);
if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
+ skb_free_datagram_locked(sk, skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 10b8193..02beb35 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -21,7 +21,6 @@
#include <linux/workqueue.h>
#include <linux/if_vlan.h>
#include <linux/rtnetlink.h>
-#include <net/ip_fib.h>
#include <net/switchdev.h>
/**
@@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_VLAN:
return sizeof(struct switchdev_obj_port_vlan);
- case SWITCHDEV_OBJ_ID_IPV4_FIB:
- return sizeof(struct switchdev_obj_ipv4_fib);
case SWITCHDEV_OBJ_ID_PORT_FDB:
return sizeof(struct switchdev_obj_port_fdb);
case SWITCHDEV_OBJ_ID_PORT_MDB:
@@ -1108,184 +1105,6 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
}
EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
-static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
-{
- const struct switchdev_ops *ops = dev->switchdev_ops;
- struct net_device *lower_dev;
- struct net_device *port_dev;
- struct list_head *iter;
-
- /* Recursively search down until we find a sw port dev.
- * (A sw port dev supports switchdev_port_attr_get).
- */
-
- if (ops && ops->switchdev_port_attr_get)
- return dev;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- port_dev = switchdev_get_lowest_dev(lower_dev);
- if (port_dev)
- return port_dev;
- }
-
- return NULL;
-}
-
-static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
-{
- struct switchdev_attr attr = {
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- };
- struct switchdev_attr prev_attr;
- struct net_device *dev = NULL;
- int nhsel;
-
- ASSERT_RTNL();
-
- /* For this route, all nexthop devs must be on the same switch. */
-
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (!nh->nh_dev)
- return NULL;
-
- dev = switchdev_get_lowest_dev(nh->nh_dev);
- if (!dev)
- return NULL;
-
- attr.orig_dev = dev;
- if (switchdev_port_attr_get(dev, &attr))
- return NULL;
-
- if (nhsel > 0 &&
- !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
- return NULL;
-
- prev_attr = attr;
- }
-
- return dev;
-}
-
-/**
- * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry
- *
- * @dst: route's IPv4 destination address
- * @dst_len: destination address length (prefix length)
- * @fi: route FIB info structure
- * @tos: route TOS
- * @type: route type
- * @nlflags: netlink flags passed in (NLM_F_*)
- * @tb_id: route table ID
- *
- * Add/modify switch IPv4 route entry.
- */
-int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 nlflags, u32 tb_id)
-{
- struct switchdev_obj_ipv4_fib ipv4_fib = {
- .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
- .dst = dst,
- .dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .nlflags = nlflags,
- .tb_id = tb_id,
- };
- struct net_device *dev;
- int err = 0;
-
- /* Don't offload route if using custom ip rules or if
- * IPv4 FIB offloading has been disabled completely.
- */
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (fi->fib_net->ipv4.fib_has_custom_rules)
- return 0;
-#endif
-
- if (fi->fib_net->ipv4.fib_offload_disabled)
- return 0;
-
- dev = switchdev_get_dev_by_nhs(fi);
- if (!dev)
- return 0;
-
- ipv4_fib.obj.orig_dev = dev;
- err = switchdev_port_obj_add(dev, &ipv4_fib.obj);
- if (!err)
- fi->fib_flags |= RTNH_F_OFFLOAD;
-
- return err == -EOPNOTSUPP ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
-
-/**
- * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch
- *
- * @dst: route's IPv4 destination address
- * @dst_len: destination address length (prefix length)
- * @fi: route FIB info structure
- * @tos: route TOS
- * @type: route type
- * @tb_id: route table ID
- *
- * Delete IPv4 route entry from switch device.
- */
-int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id)
-{
- struct switchdev_obj_ipv4_fib ipv4_fib = {
- .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
- .dst = dst,
- .dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .nlflags = 0,
- .tb_id = tb_id,
- };
- struct net_device *dev;
- int err = 0;
-
- if (!(fi->fib_flags & RTNH_F_OFFLOAD))
- return 0;
-
- dev = switchdev_get_dev_by_nhs(fi);
- if (!dev)
- return 0;
-
- ipv4_fib.obj.orig_dev = dev;
- err = switchdev_port_obj_del(dev, &ipv4_fib.obj);
- if (!err)
- fi->fib_flags &= ~RTNH_F_OFFLOAD;
-
- return err == -EOPNOTSUPP ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del);
-
-/**
- * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation
- *
- * @fi: route FIB info structure
- */
-void switchdev_fib_ipv4_abort(struct fib_info *fi)
-{
- /* There was a problem installing this route to the offload
- * device. For now, until we come up with more refined
- * policy handling, abruptly end IPv4 fib offloading for
- * for entire net by flushing offload device(s) of all
- * IPv4 routes, and mark IPv4 fib offloading broken from
- * this point forward.
- */
-
- fib_flush_external(fi->fib_net);
- fi->fib_net->ipv4.fib_offload_disabled = true;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
-
bool switchdev_port_same_parent_id(struct net_device *a,
struct net_device *b)
{
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 5bc1a3d..e0c71bd 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -44,7 +44,7 @@ static int net_ctl_permissions(struct ctl_table_header *head,
struct net *net = container_of(head->set, struct net, sysctls);
/* Allow network administrator to have same access as root. */
- if (ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) {
int mode = (table->mode >> 6) & 7;
return (mode << 6) | (mode << 3) | mode;
}
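The mode computation replicates the table's owner permission bits into the group and other positions, so a capable network administrator gets whatever access the owner (root) has; a 0644 table yields 0666, for example. (The switch to ns_capable_noaudit() only suppresses the audit record for this speculative check.) A two-line demonstration:

#include <stdio.h>

int main(void)
{
	int table_mode = 0644;
	int mode = (table_mode >> 6) & 7;	/* owner bits: 6 */

	printf("%o\n", (mode << 6) | (mode << 3) | mode);	/* 666 */
	return 0;
}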
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 17dbbe6..8a398b3 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -465,6 +465,8 @@ void vsock_pending_work(struct work_struct *work)
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
+
+ listener->sk_ack_backlog--;
} else if (!vsk->rejected) {
/* We are not on the pending list and accept() did not reject
* us, so we must have been accepted by our user process. We
@@ -475,8 +477,6 @@ void vsock_pending_work(struct work_struct *work)
goto out;
}
- listener->sk_ack_backlog--;
-
/* We need to remove ourself from the global connected sockets list so
* incoming packets can't find this socket, and to reduce the reference
* count.
@@ -2010,5 +2010,5 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport);
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
-MODULE_VERSION("1.0.1.0-k");
+MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 92eb6f0..c510810 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7353,7 +7353,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
params.n_counter_offsets_presp = len / sizeof(u16);
if (rdev->wiphy.max_num_csa_counters &&
- (params.n_counter_offsets_beacon >
+ (params.n_counter_offsets_presp >
rdev->wiphy.max_num_csa_counters))
return -EINVAL;
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 9c4fbd8..ba2b539 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -50,12 +50,18 @@ static const struct snmp_mib xfrm_mib_list[] = {
static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
{
+ unsigned long buff[LINUX_MIB_XFRMMAX];
struct net *net = seq->private;
int i;
+
+ memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
+
+ snmp_get_cpu_field_batch(buff, xfrm_mib_list,
+ net->mib.xfrm_statistics);
for (i = 0; xfrm_mib_list[i].name; i++)
seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
- snmp_fold_field(net->mib.xfrm_statistics,
- xfrm_mib_list[i].entry));
+ buff[i]);
+
return 0;
}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index ba8bf51..419bf5d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -350,6 +350,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
tasklet_hrtimer_cancel(&x->mtimer);
del_timer_sync(&x->rtimer);
+ kfree(x->aead);
kfree(x->aalg);
kfree(x->ealg);
kfree(x->calg);
@@ -1431,9 +1432,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32
{
struct xfrm_state *x;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_lock();
x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_unlock();
return x;
}
EXPORT_SYMBOL(xfrm_state_lookup);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index cb65d91..0889209 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
if (err)
goto error;
- if (attrs[XFRMA_SEC_CTX] &&
- security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])))
- goto error;
+ if (attrs[XFRMA_SEC_CTX]) {
+ err = security_xfrm_state_alloc(x,
+ nla_data(attrs[XFRMA_SEC_CTX]));
+ if (err)
+ goto error;
+ }
if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
attrs[XFRMA_REPLAY_ESN_VAL])))