diff options
Diffstat (limited to 'net')
113 files changed, 3509 insertions, 2071 deletions
diff --git a/net/Kconfig b/net/Kconfig index 57a7c5a..7021c1b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -374,6 +374,13 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + This feature provides an infrastructure to support light weight + tunnels like mpls. There is no netdevice associated with a light + weight tunnel endpoint. Tunnel encapsulation parameters are stored + with light weight tunnel state associated with fib routes. endif # if NET diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a538cb1..45e4757 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -281,6 +281,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) br_fdb_delete_by_port(br, NULL, 0, 1); br_vlan_flush(br); + br_multicast_dev_del(br); del_timer_sync(&br->gc_timer); br_sysfs_delbr(br->dev); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1198a3d..6a591e6 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, memset(&e, 0, sizeof(e)); e.ifindex = port->dev->ifindex; e.state = p->state; + e.vid = p->addr.vid; if (p->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) @@ -230,7 +231,7 @@ errout: } void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type) + struct br_ip *group, int type, u8 state) { struct br_mdb_entry entry; @@ -241,6 +242,8 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, #if IS_ENABLED(CONFIG_IPV6) entry.addr.u.ip6 = group->u.ip6; #endif + entry.state = state; + entry.vid = group->vid; __br_mdb_notify(dev, &entry, type); } @@ -263,6 +266,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry) return false; if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) return false; + if (entry->vid >= VLAN_VID_MASK) + return false; return true; } @@ -374,6 +379,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, return -EINVAL; memset(&ip, 0, sizeof(ip)); + ip.vid = entry->vid; ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; @@ -421,6 +427,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) return -EINVAL; memset(&ip, 0, sizeof(ip)); + ip.vid = entry->vid; ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 79db489..ed5dc68 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -283,6 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); + br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, + p->state); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && @@ -704,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br, if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB); + br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -924,6 +926,15 @@ void br_multicast_add_port(struct net_bridge_port *port) void br_multicast_del_port(struct net_bridge_port *port) { + struct net_bridge *br = port->br; + struct net_bridge_port_group *pg; + struct hlist_node *n; + + /* Take care of the remaining groups, only perm ones should be left */ + spin_lock_bh(&br->multicast_lock); + hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) + br_multicast_del_pg(br, pg); + spin_unlock_bh(&br->multicast_lock); del_timer_sync(&port->multicast_router_timer); } @@ -963,7 +974,8 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - br_multicast_del_pg(br, pg); + if (pg->state == MDB_TEMPORARY) + br_multicast_del_pg(br, pg); if (!hlist_unhashed(&port->rlist)) hlist_del_init_rcu(&port->rlist); @@ -1462,8 +1474,9 @@ br_multicast_leave_group(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); + br_mdb_notify(br->dev, port, group, RTM_DELMDB, + p->state); call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, port, group, RTM_DELMDB); if (!mp->ports && !mp->mglist && netif_running(br->dev)) @@ -1752,12 +1765,6 @@ void br_multicast_open(struct net_bridge *br) void br_multicast_stop(struct net_bridge *br) { - struct net_bridge_mdb_htable *mdb; - struct net_bridge_mdb_entry *mp; - struct hlist_node *n; - u32 ver; - int i; - del_timer_sync(&br->multicast_router_timer); del_timer_sync(&br->ip4_other_query.timer); del_timer_sync(&br->ip4_own_query.timer); @@ -1765,6 +1772,15 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif +} + +void br_multicast_dev_del(struct net_bridge *br) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct hlist_node *n; + u32 ver; + int i; spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8b21146..3ad1290 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -466,6 +466,7 @@ void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); +void br_multicast_dev_del(struct net_bridge *br); void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb); void br_multicast_forward(struct net_bridge_mdb_entry *mdst, @@ -488,7 +489,7 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type); + struct br_ip *group, int type, u8 state); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -565,6 +566,10 @@ static inline void br_multicast_stop(struct net_bridge *br) { } +static inline void br_multicast_dev_del(struct net_bridge *br) +{ +} + static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb) { diff --git a/net/core/Makefile b/net/core/Makefile index fec0856..086b01f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/dev.c b/net/core/dev.c index a8e4dd4..cb52cba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); +#ifdef CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -3645,7 +3655,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - qdisc_bstats_update_cpu(cl->q, skb); + qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res)) { case TC_ACT_OK: @@ -3653,7 +3663,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_drop_cpu(cl->q); + qdisc_qstats_cpu_drop(cl->q); case TC_ACT_STOLEN: case TC_ACT_QUEUED: kfree_skb(skb); @@ -6075,6 +6085,26 @@ int dev_get_phys_port_name(struct net_device *dev, EXPORT_SYMBOL(dev_get_phys_port_name); /** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -7639,7 +7669,7 @@ static int __init net_dev_init(void) open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dst.c b/net/core/dst.c index 002144be..76a617f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -22,6 +22,7 @@ #include <linux/prefetch.h> #include <net/dst.h> +#include <net/dst_metadata.h> /* * Theory of operations: @@ -158,19 +159,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = { [RTAX_MAX] = 0xdeadbeef, }; - -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags) { - struct dst_entry *dst; - - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) - return NULL; - } - dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); - if (!dst) - return NULL; dst->child = NULL; dst->dev = dev; if (dev) @@ -200,6 +192,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); +} +EXPORT_SYMBOL(dst_init); + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, unsigned short flags) +{ + struct dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + return dst; } EXPORT_SYMBOL(dst_alloc); @@ -248,7 +259,11 @@ again: dst->ops->destroy(dst); if (dst->dev) dev_put(dst->dev); - kmem_cache_free(dst->ops->kmem_cachep, dst); + + if (dst->flags & DST_METADATA) + kfree(dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) { @@ -329,6 +344,47 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +static struct dst_ops md_dst_ops = { + .family = AF_UNSPEC, +}; + +static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call output on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static int dst_md_discard(struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call input on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + struct dst_entry *dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return ERR_PTR(-ENOMEM); + + dst = &md_dst->dst; + dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + + dst->input = dst_md_discard; + dst->output = dst_md_discard_sk; + + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->opts_len = optslen; + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice @@ -393,7 +449,7 @@ static struct notifier_block dst_dev_notifier = { .priority = -10, /* must be called after other network notifiers */ }; -void __init dst_init(void) +void __init dst_subsys_init(void) { register_netdevice_notifier(&dst_dev_notifier); } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a12668..ae8306e 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -16,6 +16,7 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_rules.h> +#include <net/ip_tunnels.h> int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) @@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_FWMASK]) rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + if (tb[FRA_TUN_ID]) + rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) continue; + if (tb[FRA_TUN_ID] && + (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index be3098f..786722a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include <linux/if_vlan.h> #include <linux/bpf.h> #include <net/sch_generic.h> +#include <net/cls_cgroup.h> /** * sk_filter - run a packet through a socket filter @@ -1424,6 +1425,64 @@ const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return task_get_classid((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 vlan_proto = (__force __be16) r2; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + return skb_vlan_push(skb, vlan_proto, vlan_tci); +} + +const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); + +static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + + return skb_vlan_pop(skb); +} + +const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); + +bool bpf_helper_changes_skb_data(void *func) +{ + if (func == bpf_skb_vlan_push) + return true; + if (func == bpf_skb_vlan_pop) + return true; + return false; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1461,6 +1520,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; default: return sk_filter_func_proto(func_id); } diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 0000000..bb58826 --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,235 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/lwtunnel.h> +#include <linux/in.h> +#include <linux/init.h> +#include <linux/err.h> + +#include <net/lwtunnel.h> +#include <net/rtnetlink.h> +#include <net/ip6_fib.h> + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); + +int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (!lwtstate) + goto drop; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(sk, skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree(skb); + + return ret; +} + +int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) +{ + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt6i_lwtstate; + + return __lwtunnel_output(sk, skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_output6); + +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt_lwtstate; + + return __lwtunnel_output(sk, skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_output); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 18b34d7..194c1d0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool) proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dc004b1..788ceed 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -896,7 +896,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(1); /* IFLA_PROTO_DOWN */ + } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1082,7 +1084,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, (dev->ifalias && nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes))) + atomic_read(&dev->carrier_changes)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; if (1) { @@ -1319,6 +1322,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1861,6 +1865,14 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; + if (tb[IFLA_PROTO_DOWN]) { + err = dev_change_proto_down(dev, + nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) @@ -1951,16 +1963,30 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } +int rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; - LIST_HEAD(list_kill); err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1982,13 +2008,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev) return -ENODEV; - ops = dev->rtnl_link_ops; - if (!ops || !ops->dellink) - return -EOPNOTSUPP; - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return 0; + return rtnl_delete_link(dev); } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 43d3dd6..42689d5 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; + __skb_push(skb, ETH_HLEN); - type = classify(skb); + type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 933a928..1d59e50 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -291,6 +291,40 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(skb); } +/* Create and send an arp packet. */ +static void arp_send_dst(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, struct sk_buff *oskb) +{ + struct sk_buff *skb; + + /* arp on this interface. */ + if (dev->flags & IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + if (oskb) + skb_dst_copy(skb, oskb); + + arp_xmit(skb); +} + +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, const unsigned char *src_hw, + const unsigned char *target_hw) +{ + arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, + target_hw, NULL); +} +EXPORT_SYMBOL(arp_send); + static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; @@ -346,8 +380,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } - arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL); + arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_hw, dev->dev_addr, NULL, + dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -597,32 +632,6 @@ void arp_xmit(struct sk_buff *skb) EXPORT_SYMBOL(arp_xmit); /* - * Create and send an arp packet. - */ -void arp_send(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw) -{ - struct sk_buff *skb; - - /* - * No arp on this interface. - */ - - if (dev->flags&IFF_NOARP) - return; - - skb = arp_create(type, ptype, dest_ip, dev, src_ip, - dest_hw, src_hw, target_hw); - if (!skb) - return; - - arp_xmit(skb); -} -EXPORT_SYMBOL(arp_send); - -/* * Process an arp request. */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6bbc549..6b98de0 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,6 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + fl4.flowi4_tun_key.tun_id = 0; if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { @@ -313,6 +314,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_tun_key.tun_id = 0; no_addr = idev->ifa_list == NULL; @@ -591,6 +593,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -656,6 +660,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; + case RTA_ENCAP: + cfg->fc_encap = attr; + break; + case RTA_ENCAP_TYPE: + cfg->fc_encap_type = nla_get_u16(attr); + break; } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c7358ea..6754c64 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -42,6 +42,7 @@ #include <net/ip_fib.h> #include <net/netlink.h> #include <net/nexthop.h> +#include <net/lwtunnel.h> #include "fib_lookup.h" @@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + lwtunnel_state_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif + lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; @@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->fib_nhs) { + size_t nh_encapsize = 0; /* Also handles the special case fib_nhs == 1 */ /* each nexthop is packed in an attribute */ @@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); + /* grab encap info */ + for_nexthops(fi) { + if (nh->nh_lwtstate) { + /* RTA_ENCAP_TYPE */ + nh_encapsize += lwtunnel_get_encap_size( + nh->nh_lwtstate); + /* RTA_ENCAP */ + nh_encapsize += nla_total_size(2); + } + } endfor_nexthops(fi); + /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); + payload += nla_total_size((fi->fib_nhs * nhsize) + + nh_encapsize); + } return payload; @@ -452,6 +469,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; + int ret; + change_nexthops(fi) { int attrlen; @@ -475,18 +495,66 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (nexthop_nh->nh_tclassid) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif + nla = nla_find(attrs, attrlen, RTA_ENCAP); + if (nla) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + struct nlattr *nla_entype; + + nla_entype = nla_find(attrs, attrlen, + RTA_ENCAP_TYPE); + if (!nla_entype) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + ret = lwtunnel_build_state(dev, nla_get_u16( + nla_entype), + nla, &lwtstate); + if (ret) + goto errout; + lwtunnel_state_get(lwtstate); + nexthop_nh->nh_lwtstate = lwtstate; + } } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); return 0; + +err_inval: + ret = -EINVAL; + +errout: + return ret; } #endif +int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh) +{ + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE) + return 0; + + if (oif) + dev = __dev_get_by_index(net, oif); + ret = lwtunnel_build_state(dev, encap_type, + encap, &lwtstate); + if (!ret) + return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + + return 0; +} + int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { + struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -496,6 +564,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_encap) { + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, cfg->fc_oif, + fi->fib_nh)) + return 1; + } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; @@ -882,6 +956,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else { struct fib_nh *nh = fi->fib_nh; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto failure; + + lwtunnel_state_get(lwtstate); + nh->nh_lwtstate = lwtstate; + } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; @@ -1055,6 +1145,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif + if (fi->fib_nh->nh_lwtstate) + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1090,6 +1182,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif + if (nh->nh_lwtstate) + lwtunnel_fill_encap(skb, nh->nh_lwtstate); /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f5203fb..c0556f1 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -496,6 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ + skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, RT_TOS(tos), rt2->dst.dev); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0cb9165..8912019 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -343,7 +343,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; spin_lock(lock); @@ -371,21 +370,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw); - - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; @@ -403,13 +398,12 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; - int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); @@ -420,23 +414,22 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) spin_lock(lock); __sk_nulls_add_node_rcu(sk, list); - if (tw) { - WARN_ON(sk->sk_hash != tw->tw_hash); - twrefcnt = inet_twsk_unhash(tw); + if (osk) { + WARN_ON(sk->sk_hash != osk->sk_hash); + sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - return twrefcnt; } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) +void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, tw); + return __inet_hash_nolisten(sk, osk); WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -445,7 +438,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); - return 0; } EXPORT_SYMBOL(__inet_hash); @@ -492,7 +484,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_bucket *tb; int ret; struct net *net = sock_net(sk); - int twrefcnt = 1; if (!snum) { int i, remaining, low, high, port; @@ -560,19 +551,14 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += __inet_hash_nolisten(sk, tw); + __inet_hash_nolisten(sk, (struct sock *)tw); } if (tw) - twrefcnt += inet_twsk_bind_unhash(tw, hinfo); + inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); - if (tw) { - inet_twsk_deschedule(tw); - while (twrefcnt) { - twrefcnt--; - inet_twsk_put(tw); - } - } + if (tw) + inet_twsk_deschedule_put(tw); ret = 0; goto out; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2ffbd16..ae22cc2 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -18,28 +18,6 @@ /** - * inet_twsk_unhash - unhash a timewait socket from established hash - * @tw: timewait socket - * - * unhash a timewait socket from established hash, if hashed. - * ehash lock must be held by caller. - * Returns 1 if caller should call inet_twsk_put() after lock release. - */ -int inet_twsk_unhash(struct inet_timewait_sock *tw) -{ - if (hlist_nulls_unhashed(&tw->tw_node)) - return 0; - - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; -} - -/** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket * @hashinfo: hashinfo pointer @@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw) * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ -int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, +void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) - return 0; + return; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; + __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; - struct inet_bind_hashbucket *bhead; - int refcnt; - /* Unlink from established hashes. */ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + struct inet_bind_hashbucket *bhead; spin_lock(lock); - refcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket. */ @@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) hashinfo->bhash_size)]; spin_lock(&bhead->lock); - refcnt += inet_twsk_bind_unhash(tw, hashinfo); + inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); - atomic_sub(refcnt, &tw->tw_refcnt); atomic_dec(&tw->tw_dr->tw_count); inet_twsk_put(tw); } @@ -235,13 +205,17 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); * tcp_input.c to verify this. */ -/* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw) +/* This is for handling early-kills of TIME_WAIT sockets. + * Warning : consume reference. + * Caller should not access tw anymore. + */ +void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { if (del_timer_sync(&tw->tw_timer)) inet_twsk_kill(tw); + inet_twsk_put(tw); } -EXPORT_SYMBOL(inet_twsk_deschedule); +EXPORT_SYMBOL(inet_twsk_deschedule_put); void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) { @@ -311,9 +285,8 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw); + inet_twsk_deschedule_put(tw); local_bh_enable(); - inet_twsk_put(tw); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 31f71b1..f44bccc 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -522,7 +522,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; - int sum_truesize; u8 ecn; ipq_kill(qp); @@ -590,32 +589,19 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, add_frag_mem_limit(&qp->q, clone->truesize); } + skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; - - sum_truesize += fp->truesize; + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; + head->truesize += fp->truesize; } - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(&qp->q, head->truesize); head->next = NULL; head->dev = dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2db4c87..f4fc8a7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,6 +146,7 @@ #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> +#include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) @@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (!skb_dst(skb)) { + if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6a51a71..630e6d5 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -32,6 +32,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> +#include <linux/static_key.h> #include <net/ip.h> #include <net/icmp.h> @@ -190,3 +191,132 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = { + [IP_TUN_ID] = { .type = NLA_U64 }, + [IP_TUN_DST] = { .type = NLA_U32 }, + [IP_TUN_SRC] = { .type = NLA_U32 }, + [IP_TUN_TTL] = { .type = NLA_U8 }, + [IP_TUN_TOS] = { .type = NLA_U8 }, + [IP_TUN_SPORT] = { .type = NLA_U16 }, + [IP_TUN_DPORT] = { .type = NLA_U16 }, + [IP_TUN_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[IP_TUN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP; + + tun_info = lwt_tun_info(new_state); + + if (tb[IP_TUN_ID]) + tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]); + + if (tb[IP_TUN_DST]) + tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]); + + if (tb[IP_TUN_SRC]) + tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]); + + if (tb[IP_TUN_TTL]) + tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]); + + if (tb[IP_TUN_TOS]) + tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]); + + if (tb[IP_TUN_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]); + + if (tb[IP_TUN_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]); + + if (tb[IP_TUN_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) || + nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* IP_TUN_ID */ + + nla_total_size(4) /* IP_TUN_DST */ + + nla_total_size(4) /* IP_TUN_SRC */ + + nla_total_size(1) /* IP_TUN_TOS */ + + nla_total_size(1) /* IP_TUN_TTL */ + + nla_total_size(2) /* IP_TUN_SPORT */ + + nla_total_size(2) /* IP_TUN_DPORT */ + + nla_total_size(2); /* IP_TUN_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { + .build_state = ip_tun_build_state, + .fill_encap = ip_tun_fill_encap_info, + .get_encap_size = ip_tun_encap_nlsize, +}; + +static int __init ip_tunnel_core_init(void) +{ + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + + return 0; +} +module_init(ip_tunnel_core_init); + +static void __exit ip_tunnel_core_exit(void) +{ + lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); +} +module_exit(ip_tunnel_core_exit); + +struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(ip_tunnel_metadata_cnt); + +void ip_tunnel_need_metadata(void) +{ + static_key_slow_inc(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); + +void ip_tunnel_unneed_metadata(void) +{ + static_key_slow_dec(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 05ff44b..e89094a 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(isk->freebind || isk->transparent || has_addr || + if (!(net->ipv6.sysctl.ip_nonlocal_bind || + isk->freebind || isk->transparent || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index da5d483..3abd9d7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -300,6 +300,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), + SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), + SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d0362a2..519ec23 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> +#include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> @@ -102,6 +103,7 @@ #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> +#include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL @@ -109,6 +111,7 @@ #include <linux/kmemleak.h> #endif #include <net/secure_seq.h> +#include <net/ip_tunnels.h> #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1355,6 +1358,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } + lwtunnel_state_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1403,6 +1407,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif + if (nh->nh_lwtstate) { + lwtunnel_state_get(nh->nh_lwtstate); + rt->rt_lwtstate = nh->nh_lwtstate; + } else { + rt->rt_lwtstate = NULL; + } if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1488,6 +1498,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1546,7 +1557,6 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - unsigned int flags = 0; bool do_cache; u32 itag = 0; @@ -1610,7 +1620,7 @@ static int __mkroute_input(struct sk_buff *skb, } rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = flags; + rth->rt_flags = 0; rth->rt_type = res->type; rth->rt_is_input = 1; rth->rt_iif = 0; @@ -1618,12 +1628,15 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + if (lwtunnel_output_redirect(rth->rt_lwtstate)) + rth->dst.output = lwtunnel_output; skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1662,6 +1675,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct ip_tunnel_info *tun_info; struct flowi4 fl4; unsigned int flags = 0; u32 itag = 0; @@ -1679,6 +1693,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ + tun_info = skb_tunnel_info(skb, AF_INET); + if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; + else + fl4.flowi4_tun_key.tun_id = 0; + skb_dst_drop(skb); + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; @@ -1792,6 +1813,8 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1981,7 +2004,7 @@ add: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - + rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2261,7 +2284,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - + rt->rt_lwtstate = NULL; dst_free(new); } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index c037644..fd1405d 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 8c6fd3d..167b6a3 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -264,7 +264,7 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) u32 prior_snd_cwnd; u32 incr; - if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + if (tcp_in_slow_start(tp) && hystart_detect) tcp_cdg_hystart_update(sk); if (after(ack, ca->rtt_seq) && ca->rtt.v64) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 84be008..a2ed23c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -365,10 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) */ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - u32 cwnd = tp->snd_cwnd + acked; + u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - if (cwnd > tp->snd_ssthresh) - cwnd = tp->snd_ssthresh + 1; acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); @@ -413,7 +411,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 06d3d66..28011fb 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); @@ -439,7 +439,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tp->snd_cwnd <= tp->snd_ssthresh && + if (hystart && tcp_in_slow_start(tp) && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 882c08a..db78424 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* Update AIMD parameters. diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 58469ff..82f0d9e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* In dangerous area, increase slowly. diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index f963b27..083831e 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) rho_fractions = ca->rho_3ls - (ca->rho << 3); - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* * slow start * INC = 2^RHO - 1 diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index f71002e..2ab9bbb 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In slow start */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 728f5b3..4e4d6bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -196,11 +197,13 @@ static void tcp_enter_quickack_mode(struct sock *sk) * and the session is not interactive. */ -static inline bool tcp_in_quickack_mode(const struct sock *sk) +static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); - return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; + return (dst && dst_metric(dst, RTAX_QUICKACK)) || + (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -1037,7 +1040,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, * highest SACK block). Also calculate the lowest snd_nxt among the remaining * retransmitted skbs to avoid some costly processing per ACKs. */ -static void tcp_mark_lost_retrans(struct sock *sk) +static void tcp_mark_lost_retrans(struct sock *sk, int *flag) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -1078,7 +1081,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) if (after(received_upto, ack_seq)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - + *flag |= FLAG_LOST_RETRANS; tcp_skb_mark_lost_uncond_verify(tp, skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { @@ -1818,7 +1821,7 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk); + tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: @@ -2474,15 +2477,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) return false; } -/* The cwnd reduction in CWR and Recovery use the PRR algorithm - * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ +/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. - * 2) If packets in flight is lower than ssthresh (such as due to excess - * losses and/or application stalls), do not perform any further cwnd - * reductions, but instead slow start up to ssthresh. + * 2) Otherwise PRR uses packet conservation to send as much as delivered. + * But when the retransmits are acked without further losses, PRR + * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) { @@ -2499,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit) + int fast_rexmit, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2508,16 +2510,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, (tp->packets_out - tp->sacked_out); tp->prr_delivered += newly_acked_sacked; - if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else { + } else if ((flag & FLAG_RETRANS_DATA_ACKED) && + !(flag & FLAG_LOST_RETRANS)) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); + } else { + sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2578,7 +2582,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2588,6 +2592,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2607,6 +2612,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in @@ -2675,7 +2681,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tcp_init_undo(tp); - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (!tcp_in_cwnd_reduction(sk)) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); tcp_init_cwnd_reduction(sk); @@ -2735,7 +2741,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) /* Undo during fast recovery after partial ACK. */ static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked) + const int prior_unsacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2751,7 +2757,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * mark more packets lost or retransmit more. */ if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); return true; } @@ -2838,7 +2844,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2851,9 +2857,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); - if (icsk->icsk_ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open && + !(flag & FLAG_LOST_RETRANS)) return; - /* Fall through to processing in Open state. */ + /* Change state if cwnd is undone or retransmits are lost */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -2888,7 +2895,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); tcp_xmit_retransmit_queue(sk); } @@ -3562,10 +3569,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) &sack_state); acked -= tp->packets_out; - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, prior_unsacked, @@ -3574,6 +3577,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, acked); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3947,7 +3954,6 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3959,9 +3965,7 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - dst = __sk_dst_get(sk); - if (!dst || !dst_metric(dst, RTAX_QUICKACK)) - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d7d4c2b..486ba96 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1683,8 +1683,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; goto process; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index a51d63a..b3d64f6 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -461,7 +461,7 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, tp->snd_cwnd); } - } else if (tp->snd_cwnd > tp->snd_ssthresh && + } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bc00cb..6d8795b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_RST; } @@ -198,8 +197,7 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b1c218d..7105784 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -163,7 +163,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -174,9 +173,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && - (!dst || !dst_metric(dst, RTAX_QUICKACK))) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 333bcb2..bf5ea9e 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5b752f5..7149ebc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -649,4 +649,3 @@ void tcp_init_xmit_timers(struct sock *sk) inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } -EXPORT_SYMBOL(tcp_init_xmit_timers); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a6cea1d..13951c4 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; - if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { + if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ @@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); - } else if (tp->snd_cwnd <= tp->snd_ssthresh) { + } else if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { @@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ - else if (tp->snd_cwnd <= tp->snd_ssthresh) + else if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); } diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 112151e..0d094b9 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 438a73a..643f613 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -5,16 +5,15 @@ # IPv6 as module will cause a CRASH if you try to unload it menuconfig IPV6 tristate "The IPv6 protocol" - default m + default y ---help--- - This is complemental support for the IP version 6. - You will still be able to do traditional IPv4 networking as well. + Support for IP version 6 (IPv6). For general information about IPv6, see <https://en.wikipedia.org/wiki/IPv6>. - For Linux IPv6 development information, see <http://www.linux-ipv6.org>. - For specific information about IPv6 under Linux, read the HOWTO at - <http://www.bieringer.de/linux/IPv6/>. + For specific information about IPv6 under Linux, see + Documentation/networking/ipv6.txt and read the HOWTO at + <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> To compile this protocol support as a module, choose M here: the module will be called ipv6. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 21c2c81..eb0c6a3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_mtu = 1, .stable_secret = { .initialized = false, - } + }, + .use_oif_addrs_only = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .stable_secret = { .initialized = false, }, + .use_oif_addrs_only = 0, }; /* Check if a valid qdisc is available */ @@ -1358,15 +1360,96 @@ out: return ret; } +static int __ipv6_dev_get_saddr(struct net *net, + struct ipv6_saddr_dst *dst, + struct inet6_dev *idev, + struct ipv6_saddr_score *scores, + int hiscore_idx) +{ + struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; + + read_lock_bh(&idev->lock); + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + int i; + + /* + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense, unless it is also flagged as optimistic. + * - Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if ((score->ifa->flags & IFA_F_TENTATIVE) && + (!(score->ifa->flags & IFA_F_OPTIMISTIC))) + continue; + + score->addr_type = __ipv6_addr_type(&score->ifa->addr); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) { + net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", + idev->dev->name); + continue; + } + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i); + miniscore = ipv6_get_saddr_eval(net, score, dst, i); + + if (minihiscore > miniscore) { + if (i == IPV6_SADDR_RULE_SCOPE && + score->scopedist > 0) { + /* + * special case: + * each remaining entry + * has too small (not enough) + * scope, because ifa entries + * are sorted by their scope + * values. + */ + goto out; + } + break; + } else if (minihiscore < miniscore) { + if (hiscore->ifa) + in6_ifa_put(hiscore->ifa); + + in6_ifa_hold(score->ifa); + + swap(hiscore, score); + hiscore_idx = 1 - hiscore_idx; + + /* restore our iterator */ + score->ifa = hiscore->ifa; + + break; + } + } + } +out: + read_unlock_bh(&idev->lock); + return hiscore_idx; +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct ipv6_saddr_score scores[2], - *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_score scores[2], *hiscore; struct ipv6_saddr_dst dst; + struct inet6_dev *idev; struct net_device *dev; int dst_type; + bool use_oif_addr = false; + int hiscore_idx = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1375,105 +1458,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); dst.prefs = prefs; - hiscore->rule = -1; - hiscore->ifa = NULL; + scores[hiscore_idx].rule = -1; + scores[hiscore_idx].ifa = NULL; rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - struct inet6_dev *idev; - - /* Candidate Source Address (section 4) - * - multicast and link-local destination address, - * the set of candidate source address MUST only - * include addresses assigned to interfaces - * belonging to the same link as the outgoing - * interface. - * (- For site-local destination addresses, the - * set of candidate source addresses MUST only - * include addresses assigned to interfaces - * belonging to the same site as the outgoing - * interface.) - */ - if (((dst_type & IPV6_ADDR_MULTICAST) || - dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && - dst.ifindex && dev->ifindex != dst.ifindex) - continue; - - idev = __in6_dev_get(dev); - if (!idev) - continue; - - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { - int i; - - /* - * - Tentative Address (RFC2462 section 5.4) - * - A tentative address is not considered - * "assigned to an interface" in the traditional - * sense, unless it is also flagged as optimistic. - * - Candidate Source Address (section 4) - * - In any case, anycast addresses, multicast - * addresses, and the unspecified address MUST - * NOT be included in a candidate set. - */ - if ((score->ifa->flags & IFA_F_TENTATIVE) && - (!(score->ifa->flags & IFA_F_OPTIMISTIC))) - continue; - - score->addr_type = __ipv6_addr_type(&score->ifa->addr); + /* Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + * - "It is RECOMMENDED that the candidate source addresses + * be the set of unicast addresses assigned to the + * interface that will be used to send to the destination + * (the 'outgoing' interface)." (RFC 6724) + */ + if (dst_dev) { + idev = __in6_dev_get(dst_dev); + if ((dst_type & IPV6_ADDR_MULTICAST) || + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || + (idev && idev->cnf.use_oif_addrs_only)) { + use_oif_addr = true; + } + } - if (unlikely(score->addr_type == IPV6_ADDR_ANY || - score->addr_type & IPV6_ADDR_MULTICAST)) { - net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", - dev->name); + if (use_oif_addr) { + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); + } else { + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (!idev) continue; - } - - score->rule = -1; - bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); - - for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { - int minihiscore, miniscore; - - minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); - miniscore = ipv6_get_saddr_eval(net, score, &dst, i); - - if (minihiscore > miniscore) { - if (i == IPV6_SADDR_RULE_SCOPE && - score->scopedist > 0) { - /* - * special case: - * each remaining entry - * has too small (not enough) - * scope, because ifa entries - * are sorted by their scope - * values. - */ - goto try_nextdev; - } - break; - } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - - swap(hiscore, score); - - /* restore our iterator */ - score->ifa = hiscore->ifa; - - break; - } - } + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } -try_nextdev: - read_unlock_bh(&idev->lock); } rcu_read_unlock(); + hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) return -EADDRNOTAVAIL; @@ -4586,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; /* we omit DEVCONF_STABLE_SECRET for now */ + array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } static inline size_t inet6_ifla6_size(void) @@ -5585,6 +5614,14 @@ static struct addrconf_sysctl_table .proc_handler = addrconf_sysctl_stable_secret, }, { + .procname = "use_oif_addrs_only", + .data = &ipv6_devconf.use_oif_addrs_only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, + { /* sentinel */ } }, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7de52b6..7bc92ea 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -342,7 +342,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!(inet->freebind || inet->transparent) && + if (!net->ipv6.sysctl.ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; @@ -679,8 +680,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct ipv6_pinfo *np = inet6_sk(sk); if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || - np->rxopt.bits.ohopopts)) || + if (((opt->flags & IP6SKB_HOPBYHOP) && + (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index b10a889..2572a32 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -568,8 +568,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } /* HbH is allowed only once */ - if (np->rxopt.bits.hopopts && opt->hop) { - u8 *ptr = nh + opt->hop; + if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { + u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } @@ -630,8 +630,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } - if (np->rxopt.bits.ohopopts && opt->hop) { - u8 *ptr = nh + opt->hop; + if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { + u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index a7bbbe4..ce203b0 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb) return -1; } - opt->hop = sizeof(struct ipv6hdr); + opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b4fd96d..6ac8dad 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -207,7 +207,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; spin_lock(lock); @@ -234,21 +233,17 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw); - - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 55d1986..d715f2e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/lwtunnel.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> @@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { + lwtunnel_state_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d5f7716..c5fc852 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1023,6 +1023,8 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; + if (!fl6->flowi6_oif) + fl6->flowi6_oif = dst->dev->ifindex; return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ca4700c..fdbada156 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; - if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!(addr_type & IPV6_ADDR_MULTICAST) && + !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6090969..7f2214f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -58,6 +58,7 @@ #include <net/netevent.h> #include <net/netlink.h> #include <net/nexthop.h> +#include <net/lwtunnel.h> #include <asm/uaccess.h> @@ -1770,6 +1771,18 @@ int ip6_route_add(struct fib6_config *cfg) rt->dst.output = ip6_output; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto out; + lwtunnel_state_get(lwtstate); + rt->rt6i_lwtstate = lwtstate; + rt->dst.output = lwtunnel_output6; + } + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) @@ -2595,6 +2608,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2689,6 +2704,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= RTF_PREF(pref); } + if (tb[RTA_ENCAP]) + cfg->fc_encap = tb[RTA_ENCAP]; + + if (tb[RTA_ENCAP_TYPE]) + cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = 0; errout: return err; @@ -2721,6 +2742,10 @@ beginning: r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); } err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); if (err) { @@ -2783,7 +2808,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(void) +static inline size_t rt6_nlmsg_size(struct rt6_info *rt) { return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ @@ -2797,7 +2822,8 @@ static inline size_t rt6_nlmsg_size(void) + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ - + nla_total_size(1); /* RTA_PREF */ + + nla_total_size(1) /* RTA_PREF */ + + lwtunnel_get_encap_size(rt->rt6i_lwtstate); } static int rt6_fill_node(struct net *net, @@ -2945,6 +2971,8 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; + lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + nlmsg_end(skb, nlh); return 0; @@ -3071,7 +3099,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 4e705ad..db48aeb 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -75,6 +75,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -117,6 +124,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries; ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; + ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6748c42..d540846 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1481,8 +1481,7 @@ do_time_wait: ntohs(th->dest), tcp_v6_iif(skb)); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); goto process; diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 17bde79..5c467ef 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 65bbe68..9ca9236 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1f93a59..49f1b0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -15,6 +15,7 @@ #include <net/ip_fib.h> #include <net/netevent.h> #include <net/netns/generic.h> +#include <net/ip6_route.h> #include "internal.h" #define LABEL_NOT_SPECIFIED (1<<20) @@ -58,10 +59,11 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->mpls_ptr); } -static bool mpls_output_possible(const struct net_device *dev) +bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } +EXPORT_SYMBOL_GPL(mpls_output_possible); static unsigned int mpls_rt_header_size(const struct mpls_route *rt) { @@ -69,13 +71,14 @@ static unsigned int mpls_rt_header_size(const struct mpls_route *rt) return rt->rt_labels * sizeof(struct mpls_shim_hdr); } -static unsigned int mpls_dev_mtu(const struct net_device *dev) +unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } +EXPORT_SYMBOL_GPL(mpls_dev_mtu); -static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -85,6 +88,7 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) return true; } +EXPORT_SYMBOL_GPL(mpls_pkt_too_big); static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) @@ -327,6 +331,70 @@ static unsigned find_free_label(struct net *net) return LABEL_NOT_SPECIFIED; } +static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) +{ + struct net_device *dev = NULL; + struct rtable *rt; + struct in_addr daddr; + + memcpy(&daddr, addr, sizeof(struct in_addr)); + rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); + if (IS_ERR(rt)) + goto errout; + + dev = rt->dst.dev; + dev_hold(dev); + + ip_rt_put(rt); + +errout: + return dev; +} + +static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) +{ + struct net_device *dev = NULL; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto errout; + + dev = dst->dev; + dev_hold(dev); + +errout: + dst_release(dst); + + return dev; +} + +static struct net_device *find_outdev(struct net *net, + struct mpls_route_config *cfg) +{ + struct net_device *dev = NULL; + + if (!cfg->rc_ifindex) { + switch (cfg->rc_via_table) { + case NEIGH_ARP_TABLE: + dev = inet_fib_lookup_dev(net, cfg->rc_via); + break; + case NEIGH_ND_TABLE: + dev = inet6_fib_lookup_dev(net, cfg->rc_via); + break; + case NEIGH_LINK_TABLE: + break; + } + } else { + dev = dev_get_by_index(net, cfg->rc_ifindex); + } + + return dev; +} + static int mpls_route_add(struct mpls_route_config *cfg) { struct mpls_route __rcu **platform_label; @@ -358,7 +426,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) goto errout; err = -ENODEV; - dev = dev_get_by_index(net, cfg->rc_ifindex); + dev = find_outdev(net, cfg); if (!dev) goto errout; @@ -626,6 +694,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, return 0; } +EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]) @@ -671,6 +740,7 @@ int nla_get_labels(const struct nlattr *nla, *labels = nla_labels; return 0; } +EXPORT_SYMBOL_GPL(nla_get_labels); static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg) diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 8cabeb5..2681a4b 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -50,7 +50,12 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } -int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]); +int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, + const u32 label[]); +int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, + u32 label[]); +bool mpls_output_possible(const struct net_device *dev); +unsigned int mpls_dev_mtu(const struct net_device *dev); +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); #endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c new file mode 100644 index 0000000..276f8c9 --- /dev/null +++ b/net/mpls/mpls_iptunnel.c @@ -0,0 +1,233 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/module.h> +#include <linux/mpls.h> +#include <linux/vmalloc.h> +#include <net/ip.h> +#include <net/dst.h> +#include <net/lwtunnel.h> +#include <net/netevent.h> +#include <net/netns/generic.h> +#include <net/ip6_fib.h> +#include <net/route.h> +#include <net/mpls_iptunnel.h> +#include <linux/mpls_iptunnel.h> +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + struct lwtunnel_state *lwtstate = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (skb->protocol == htons(ETH_P_IP)) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + lwtstate = rt->rt_lwtstate; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + lwtstate = rt6->rt6i_lwtstate; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = dst->dev; + if (!mpls_output_possible(out_dev) || + !lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 52561e1..cb2f13e 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -166,11 +166,13 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; *dest = out->group; break; +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: if (skb->sk == NULL || !sk_fullsock(skb->sk)) goto err; *dest = skb->sk->sk_classid; break; +#endif default: WARN_ON(1); goto err; @@ -246,7 +248,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_CPU: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: +#endif len = sizeof(u32); break; case NFT_META_IIFNAME: diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index cca96ce..d0c96c5 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -272,8 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -437,8 +436,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, tgi->lport ? tgi->lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 1584040..1119f46 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -44,18 +44,6 @@ config OPENVSWITCH_GRE If unsure, say Y. -config OPENVSWITCH_VXLAN - tristate "Open vSwitch VXLAN tunneling support" - depends on OPENVSWITCH - depends on VXLAN - default OPENVSWITCH - ---help--- - If you say Y here, then the Open vSwitch will be able create vxlan vport. - - Say N to exclude this support and reduce the binary size. - - If unsure, say Y. - config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 91b9478..38e0e14 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -16,5 +16,4 @@ openvswitch-y := \ vport-netdev.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o -obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a8c0b8..cf04c2f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -611,7 +611,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; + struct ip_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; @@ -733,7 +733,15 @@ static int execute_set_action(struct sk_buff *skb, { /* Only tunnel set execution is supported without a mask. */ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_CB(skb)->egress_tun_info = nla_data(a); + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); + + /* FIXME: Remove when all vports have been converted */ + OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info; + return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ff8c4a4..ffe984f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -176,7 +176,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) @@ -188,7 +188,7 @@ static int get_dpifindex(const struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -1018,7 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_unlock(); - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } @@ -1030,7 +1030,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1157,7 +1157,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); return 0; @@ -1165,7 +1165,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); error: return error; } @@ -1800,7 +1800,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - vport->ops->get_name(vport))) + ovs_vport_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -2219,13 +2219,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (dev_net(netdev_vport->dev) == dnet) + if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index cd691e9..6b28c5c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -25,6 +25,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> +#include <net/ip_tunnels.h> #include "flow.h" #include "flow_table.h" @@ -98,7 +99,7 @@ struct datapath { * when a packet is received by OVS. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -114,7 +115,7 @@ struct ovs_skb_cb { * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + const struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 2c631fe..a7a80a6 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) continue; - netdev_vport = netdev_vport_priv(vport); - if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bc7b0ab..8db22ef 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); if (tun_info->options) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a076e44..b62cdb3 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,31 +32,11 @@ #include <linux/time.h> #include <linux/flex_array.h> #include <net/inet_ecn.h> +#include <net/ip_tunnels.h> +#include <net/dst_metadata.h> struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. */ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -66,54 +46,9 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. */ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ @@ -122,7 +57,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ @@ -273,7 +208,7 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624e41c..a6eb77a 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -47,9 +47,9 @@ #include <net/ipv6.h> #include <net/ndisc.h> #include <net/mpls.h> +#include <net/vxlan.h> #include "flow_netlink.h" -#include "vport-vxlan.h" struct ovs_len_tbl { int len; @@ -475,7 +475,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, { struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; unsigned long opt_key_offset; - struct ovs_vxlan_opts opts; + struct vxlan_metadata opts; int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); @@ -626,7 +626,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, static int vxlan_opt_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len) { - const struct ovs_vxlan_opts *opts = tun_opts; + const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); @@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, } static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_KEY && @@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, } static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { struct nlattr *nla; @@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, egress_tun_info->options, egress_tun_info->options_len); } @@ -1548,11 +1548,48 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) return sfa; } +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + const struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + } + } + + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + /* Schedules 'sf_acts' to be freed after the next RCU grace period. * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) { - kfree_rcu(sf_acts, rcu); + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1746,7 +1783,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct metadata_dst *tun_dst; + struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; @@ -1771,13 +1810,23 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len, log); - if (IS_ERR(a)) + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + dst_release((struct dst_entry *)tun_dst); return PTR_ERR(a); + } + + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; - tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->key = key.tun_key; tun_info->options_len = key.tun_opts_len; if (tun_info->options_len) { @@ -2177,7 +2226,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) - kfree(*sfa); + ovs_nla_free_flow_actions(*sfa); return err; } @@ -2227,13 +2276,14 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + err = ipv4_tun_to_nlattr(skb, &tun_info->key, tun_info->options_len ? tun_info->options : NULL, tun_info->options_len); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 5c3d75b..acd0744 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, @@ -69,5 +69,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 6552394..3a9d1dde76 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -18,6 +18,7 @@ #include "flow.h" #include "datapath.h" +#include "flow_netlink.h" #include <linux/uaccess.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> @@ -143,7 +144,8 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); - kfree((struct sw_flow_actions __force *)flow->sf_acts); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) kmem_cache_free(flow_stats_cache, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 208c576..1da3a14 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) struct vport *vport = gs->rcv_data; struct genevehdr *geneveh = geneve_hdr(skb); int opts_len; - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; __be64 key; __be16 flags; @@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) key = vni_to_tunnel_id(geneveh->vni); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, geneveh->options, opts_len); ovs_vport_receive(vport, skb, &tun_info); } @@ -165,8 +164,8 @@ error: static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) { - const struct ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_key *tun_key; + struct ip_tunnel_info *tun_info; struct net *net = ovs_dp_get_net(vport->dp); struct geneve_port *geneve_port = geneve_vport(vport); __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; @@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport) } static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { struct geneve_port *geneve_port = geneve_vport(vport); struct net *net = ovs_dp_get_net(vport->dp); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f17ac96..b87656c 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -67,9 +67,9 @@ static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -97,7 +97,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -108,8 +108,8 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key, + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; @@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -147,7 +147,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_skb; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -277,7 +277,7 @@ static void gre_tnl_destroy(struct vport *vport) } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { return ovs_tunnel_get_egress_info(egress_tun_info, ovs_dp_get_net(vport->dp), diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 6a55f71..c058bbf 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -156,49 +156,44 @@ static void do_setup(struct net_device *netdev) static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, - do_setup); - if (!netdev_vport->dev) { + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) goto error_free_netdev; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; error_free_netdev: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); error: @@ -207,21 +202,19 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ - unregister_netdevice(netdev_vport->dev); + unregister_netdevice(vport->dev); rtnl_unlock(); } static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; + struct net_device *netdev = vport->dev; int len; if (unlikely(!(netdev->flags & IFF_UP))) { @@ -249,7 +242,6 @@ static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 33e6d6e..68d0582 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -27,9 +27,13 @@ #include <linux/skbuff.h> #include <linux/openvswitch.h> -#include <net/llc.h> +#include <net/udp.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> +#include <net/vxlan.h> #include "datapath.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -83,104 +87,93 @@ static struct net_device *get_dpdev(const struct datapath *dp) local = ovs_vport_ovsl(dp, OVSP_LOCAL); BUG_ON(!local); - return netdev_vport_priv(local)->dev; + return local->dev; } -static struct vport *netdev_create(const struct vport_parms *parms) +static struct vport *netdev_link(struct vport *vport, const char *name) { - struct vport *vport; - struct netdev_vport *netdev_vport; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_master_upper_dev_link(netdev_vport->dev, + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp)); if (err) goto error_unlock; - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; - dev_disable_lro(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + return netdev_link(vport, parms->name); +} + static void free_port_rcu(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport = container_of(rcu, struct vport, rcu); - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + if (vport->dev) + dev_put(vport->dev); + ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - ASSERT_RTNL(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - netdev_upper_dev_unlink(netdev_vport->dev, - netdev_master_upper_dev_get(netdev_vport->dev)); - dev_set_promiscuity(netdev_vport->dev, -1); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); - if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&netdev_vport->rcu, free_port_rcu); -} - -const char *ovs_netdev_get_name(const struct vport *vport) -{ - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; + call_rcu(&vport->rcu, free_port_rcu); } static unsigned int packet_length(const struct sk_buff *skb) @@ -195,18 +188,17 @@ static unsigned int packet_length(const struct sk_buff *skb) static int netdev_send(struct vport *vport, struct sk_buff *skb) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; + int mtu = vport->dev->mtu; int len; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, + vport->dev->name, packet_length(skb), mtu); goto drop; } - skb->dev = netdev_vport->dev; + skb->dev = vport->dev; len = skb->len; dev_queue_xmit(skb); @@ -231,16 +223,205 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, .send = netdev_send, }; +/* Compat code for old userspace. */ +#if IS_ENABLED(CONFIG_VXLAN) +static struct vport_ops ovs_vxlan_netdev_vport_ops; + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + if (exts[OVS_VXLAN_EXT_GBP]) + conf->flags |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, + }; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + conf.dst_port = htons(nla_get_u16(a)); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a, &conf); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *vxlan_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return netdev_link(vport, parms->name); +} + +static void vxlan_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + + /* Early release so we can unregister the device */ + dev_put(vport->dev); + rtnl_delete_link(vport->dev); + vport->dev = NULL; + rtnl_unlock(); + + call_rcu(&vport->rcu, free_port_rcu); +} + +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ip_tunnel_info *egress_tun_info) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dst_port = vxlan_dev_dst_port(vxlan); + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = vxlan_destroy, + .get_options = vxlan_get_options, + .send = netdev_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, +}; + +static int vxlan_compat_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); +} + +static void vxlan_compat_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); +} +#else +static int vxlan_compat_init(void) +{ + return 0; +} + +static void vxlan_compat_exit(void) +{ +} +#endif + int __init ovs_netdev_init(void) { - return ovs_vport_ops_register(&ovs_netdev_vport_ops); + int err; + + err = ovs_vport_ops_register(&ovs_netdev_vport_ops); + if (err) + return err; + err = vxlan_compat_init(); + if (err) + vxlan_compat_exit(); + return err; } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); + vxlan_compat_exit(); } diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 6f7038e..684fb88 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -26,19 +26,6 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; - - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - -const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c deleted file mode 100644 index 6d39766..0000000 --- a/net/openvswitch/vport-vxlan.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright (c) 2014 Nicira, Inc. - * Copyright (c) 2013 Cisco Systems, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/rculist.h> -#include <linux/udp.h> -#include <linux/module.h> - -#include <net/icmp.h> -#include <net/ip.h> -#include <net/udp.h> -#include <net/ip_tunnels.h> -#include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/vxlan.h> - -#include "datapath.h" -#include "vport.h" -#include "vport-vxlan.h" - -/** - * struct vxlan_port - Keeps track of open UDP ports - * @vs: vxlan_sock created for the port. - * @name: vport name. - */ -struct vxlan_port { - struct vxlan_sock *vs; - char name[IFNAMSIZ]; - u32 exts; /* VXLAN_F_* in <net/vxlan.h> */ -}; - -static struct vport_ops ovs_vxlan_vport_ops; - -static inline struct vxlan_port *vxlan_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -/* Called with rcu_read_lock and BH disabled. */ -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct ovs_tunnel_info tun_info; - struct vxlan_port *vxlan_port; - struct vport *vport = vs->data; - struct iphdr *iph; - struct ovs_vxlan_opts opts = { - .gbp = md->gbp, - }; - __be64 key; - __be16 flags; - - flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0); - vxlan_port = vxlan_vport(vport); - if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) - flags |= TUNNEL_VXLAN_OPT; - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); - - ovs_vport_receive(vport, skb, &tun_info); -} - -static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) - return -EMSGSIZE; - - if (vxlan_port->exts) { - struct nlattr *exts; - - exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); - if (!exts) - return -EMSGSIZE; - - if (vxlan_port->exts & VXLAN_F_GBP && - nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) - return -EMSGSIZE; - - nla_nest_end(skb, exts); - } - - return 0; -} - -static void vxlan_tnl_destroy(struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - - vxlan_sock_release(vxlan_port->vs); - - ovs_vport_deferred_free(vport); -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, -}; - -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) -{ - struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; - struct vxlan_port *vxlan_port; - int err; - - if (nla_len(attr) < sizeof(struct nlattr)) - return -EINVAL; - - err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); - if (err < 0) - return err; - - vxlan_port = vxlan_vport(vport); - - if (exts[OVS_VXLAN_EXT_GBP]) - vxlan_port->exts |= VXLAN_F_GBP; - - return 0; -} - -static struct vport *vxlan_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct vxlan_port *vxlan_port; - struct vxlan_sock *vs; - struct vport *vport; - struct nlattr *a; - u16 dst_port; - int err; - - if (!options) { - err = -EINVAL; - goto error; - } - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(sizeof(struct vxlan_port), - &ovs_vxlan_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - vxlan_port = vxlan_vport(vport); - strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); - if (a) { - err = vxlan_configure_exts(vport, a); - if (err) { - ovs_vport_free(vport); - goto error; - } - } - - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, - vxlan_port->exts); - if (IS_ERR(vs)) { - ovs_vport_free(vport); - return (void *)vs; - } - vxlan_port->vs = vs; - - return vport; - -error: - return ERR_PTR(err); -} - -static int vxlan_ext_gbp(struct sk_buff *skb) -{ - const struct ovs_tunnel_info *tun_info; - const struct ovs_vxlan_opts *opts; - - tun_info = OVS_CB(skb)->egress_tun_info; - opts = tun_info->options; - - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && - tun_info->options_len >= sizeof(*opts)) - return opts->gbp; - else - return 0; -} - -static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - struct sock *sk = vxlan_port->vs->sock->sk; - __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; - struct vxlan_metadata md = {0}; - struct rtable *rt; - struct flowi4 fl; - __be16 src_port; - __be16 df; - int err; - u32 vxflags; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - src_port = udp_flow_src_port(net, skb, 0, 0, true); - md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); - md.gbp = vxlan_ext_gbp(skb); - vxflags = vxlan_port->exts | - (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0); - - err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, - src_port, dst_port, - &md, false, vxflags); - if (err < 0) - ip_rt_put(rt); - return err; -error: - kfree_skb(skb); - return err; -} - -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); - - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - src_port, dst_port); -} - -static const char *vxlan_get_name(const struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - return vxlan_port->name; -} - -static struct vport_ops ovs_vxlan_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_tnl_create, - .destroy = vxlan_tnl_destroy, - .get_name = vxlan_get_name, - .get_options = vxlan_get_options, - .send = vxlan_tnl_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, - .owner = THIS_MODULE, -}; - -static int __init ovs_vxlan_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_vxlan_vport_ops); -} - -static void __exit ovs_vxlan_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); -} - -module_init(ovs_vxlan_tnl_init); -module_exit(ovs_vxlan_tnl_exit); - -MODULE_DESCRIPTION("OVS: VXLAN switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h deleted file mode 100644 index 4b08233e..0000000 --- a/net/openvswitch/vport-vxlan.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef VPORT_VXLAN_H -#define VPORT_VXLAN_H 1 - -#include <linux/kernel.h> -#include <linux/types.h> - -struct ovs_vxlan_opts { - __u32 gbp; -}; - -#endif diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 067a3ff..d14f594 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -113,7 +113,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -226,7 +226,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) } bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); + ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } @@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) + const struct ip_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst) { - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct rtable *rt; struct flowi4 fl; if (unlikely(!tun_info)) return -EINVAL; - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; /* Route lookup to get srouce IP address. * The process may need to be changed if the corresponding process @@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ovs_flow_tun_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); + __ip_tunnel_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) + struct ip_tunnel_info *info) { /* get_egress_tun_info() is only implemented on tunnel ports. */ if (unlikely(!vport->ops->get_egress_tun_info)) diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index bc85331..1a689c2 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,6 +27,7 @@ #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> +#include <net/route.h> #include "datapath.h" @@ -58,15 +59,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); + struct ip_tunnel_info *info); /* The following definitions are for implementers of vport devices: */ @@ -106,7 +107,7 @@ struct vport_portids { * @detach_list: list used for detaching vport in net-exit call. */ struct vport { - struct rcu_head rcu; + struct net_device *dev; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -119,6 +120,7 @@ struct vport { struct vport_err_stats err_stats; struct list_head detach_list; + struct rcu_head rcu; }; /** @@ -176,7 +178,7 @@ struct vport_ops { int (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); + struct ip_tunnel_info *); struct module *owner; struct list_head list; @@ -226,7 +228,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -235,11 +237,16 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev ? vport->dev->name : vport->ops->get_name(vport); +} + int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ovs_key_ipv4_tunnel *key, + const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index af427a3..074a32f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -27,6 +27,15 @@ #include <net/act_api.h> #include <net/netlink.h> +static void free_tcf(struct rcu_head *head) +{ + struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + + free_percpu(p->cpu_bstats); + free_percpu(p->cpu_qstats); + kfree(p); +} + void tcf_hash_destroy(struct tc_action *a) { struct tcf_common *p = a->priv; @@ -41,7 +50,7 @@ void tcf_hash_destroy(struct tc_action *a) * gen_estimator est_timer() might access p->tcfc_lock * or bstats, wait a RCU grace period before freeing p */ - kfree_rcu(p, tcfc_rcu); + call_rcu(&p->tcfc_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_destroy); @@ -230,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) if (est) gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); + call_rcu(&pc->tcfc_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind) + int size, int bind, bool cpustats) { struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; @@ -246,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, if (bind) p->tcfc_bindcnt = 1; + if (cpustats) { + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!p->cpu_bstats) { +err1: + kfree(p); + return err; + } + p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!p->cpu_qstats) { +err2: + free_percpu(p->cpu_bstats); + goto err1; + } + } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, NULL, - &p->tcfc_rate_est, - &p->tcfc_lock, est); + err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, + &p->tcfc_rate_est, + &p->tcfc_lock, est); if (err) { - kfree(p); - return err; + free_percpu(p->cpu_qstats); + goto err2; } } @@ -615,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, NULL, + gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfc_qstats, p->tcfc_qstats.qlen) < 0) goto errout; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 1df7828..e9e923a 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -281,7 +281,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, act, bind)) { ret = tcf_hash_create(parm->index, est, act, - sizeof(*prog), bind); + sizeof(*prog), bind, false); if (ret < 0) goto destroy_fp; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 295d14b..f2b5402 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -108,7 +108,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), + bind, false); if (ret) return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4cd5cf1..b07c535 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_CSUM_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 7fffc22..5c1b051 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -28,14 +28,18 @@ #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { - if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (prandom_u32() % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } static int gact_determ(struct tcf_gact *gact) { - if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + u32 pack = atomic_inc_return(&gact->packets); + + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (pack % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } @@ -85,7 +89,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #endif if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -99,16 +104,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(a); - spin_lock_bh(&gact->tcf_lock); + ASSERT_RTNL(); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { gact->tcfg_paction = p_parm->paction; - gact->tcfg_pval = p_parm->pval; + gact->tcfg_pval = max_t(u16, 1, p_parm->pval); + /* Make sure tcfg_pval is written before tcfg_ptype + * coupled with smp_rmb() in gact_net_rand() & gact_determ() + */ + smp_wmb(); gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(a); return ret; @@ -118,23 +126,21 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = a->priv; - int action = TC_ACT_SHOT; + int action = READ_ONCE(gact->tcf_action); - spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype) - action = gact_rand[gact->tcfg_ptype](gact); - else - action = gact->tcf_action; -#else - action = gact->tcf_action; + { + u32 ptype = READ_ONCE(gact->tcfg_ptype); + + if (ptype) + action = gact_rand[ptype](gact); + } #endif - gact->tcf_bstats.bytes += qdisc_pkt_len(skb); - gact->tcf_bstats.packets++; + bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); if (action == TC_ACT_SHOT) - gact->tcf_qstats.drops++; - gact->tcf_tm.lastuse = jiffies; - spin_unlock(&gact->tcf_lock); + qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + + tcf_lastuse_update(&gact->tcf_tm); return action; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index cbc8dd7..99c9cc1 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, index = nla_get_u32(tb[TCA_IPT_INDEX]); if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index a42a3b2..19cd890 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -35,9 +35,11 @@ static LIST_HEAD(mirred_list); static void tcf_mirred_release(struct tc_action *a, int bind) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); + list_del(&m->tcfm_list); - if (m->tcfm_dev) - dev_put(m->tcfm_dev); + if (dev) + dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -93,7 +95,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -105,18 +108,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } m = to_mirred(a); - spin_lock_bh(&m->tcf_lock); + ASSERT_RTNL(); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; if (ret != ACT_P_CREATED) - dev_put(m->tcfm_dev); + dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); - m->tcfm_dev = dev; + rcu_assign_pointer(m->tcfm_dev, dev); m->tcfm_ok_push = ok_push; } - spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { list_add(&m->tcfm_list, &mirred_list); tcf_hash_insert(a); @@ -131,20 +134,22 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_mirred *m = a->priv; struct net_device *dev; struct sk_buff *skb2; + int retval, err; u32 at; - int retval, err = 1; - spin_lock(&m->tcf_lock); - m->tcf_tm.lastuse = jiffies; - bstats_update(&m->tcf_bstats, skb); + tcf_lastuse_update(&m->tcf_tm); + + bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - dev = m->tcfm_dev; - if (!dev) { - printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + rcu_read_lock(); + retval = READ_ONCE(m->tcf_action); + dev = rcu_dereference(m->tcfm_dev); + if (unlikely(!dev)) { + pr_notice_once("tc mirred: target device is gone\n"); goto out; } - if (!(dev->flags & IFF_UP)) { + if (unlikely(!(dev->flags & IFF_UP))) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto out; @@ -152,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, at = G_TC_AT(skb->tc_verd); skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) + if (!skb2) goto out; if (!(at & AT_EGRESS)) { @@ -168,16 +173,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->dev = dev; err = dev_queue_xmit(skb2); -out: if (err) { - m->tcf_qstats.overlimits++; +out: + qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); if (m->tcfm_eaction != TCA_EGRESS_MIRROR) retval = TC_ACT_SHOT; - else - retval = m->tcf_action; - } else - retval = m->tcf_action; - spin_unlock(&m->tcf_lock); + } + rcu_read_unlock(); return retval; } @@ -216,14 +218,16 @@ static int mirred_device_event(struct notifier_block *unused, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; + ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) list_for_each_entry(m, &mirred_list, tcfm_list) { - spin_lock_bh(&m->tcf_lock); - if (m->tcfm_dev == dev) { + if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); - m->tcfm_dev = NULL; + /* Note : no rcu grace period necessary, as + * net_device are already rcu protected. + */ + RCU_INIT_POINTER(m->tcfm_dev, NULL); } - spin_unlock_bh(&m->tcf_lock); } return NOTIFY_DONE; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 270a030..5be0b3c 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_NAT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 17e6d66..ce8676a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; p = to_pedit(a); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 6a8d948..d6b708d 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, defdata = nla_data(tb[TCA_DEF_DATA]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index fcfeeaf..6751b5f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index d735ecf..796785e 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*v), + bind, false); if (ret) return ret; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index ea611b21..4c85bd3 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); - u32 classid; - - classid = task_cls_state(current)->classid; - - /* - * Due to the nature of the classifier it is required to ignore all - * packets originating from softirq context as accessing `current' - * would lead to false results. - * - * This test assumes that all callers of dev_queue_xmit() explicitely - * disable bh. Knowing this, it is possible to detect softirq based - * calls by looking at the number of nested bh disable calls because - * softirqs always disables bh. - */ - if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ - if (!skb->sk) - return -1; - classid = skb->sk->sk_classid; - } + u32 classid = task_get_classid(skb); if (!classid) return -1; - if (!tcf_em_tree_match(skb, &head->ematches, NULL)) return -1; res->classid = classid; res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index b8d73bc..ffaeea6 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -186,7 +186,6 @@ struct qfq_sched { u64 oldV, V; /* Precise virtual times. */ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ - u32 num_active_agg; /* Num. of active aggregates */ u32 wsum; /* weight sum */ u32 iwsum; /* inverse weight sum */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 59e8035..4345790 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -487,23 +487,35 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { + struct net_device *odev; + if (!laddr->valid) continue; - if ((laddr->state == SCTP_ADDR_SRC) && - (AF_INET == laddr->a.sa.sa_family)) { - fl4->fl4_sport = laddr->a.v4.sin_port; - flowi4_update_output(fl4, - asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), - daddr->v4.sin_addr.s_addr, - laddr->a.v4.sin_addr.s_addr); - - rt = ip_route_output_key(sock_net(sk), fl4); - if (!IS_ERR(rt)) { - dst = &rt->dst; - goto out_unlock; - } - } + if (laddr->state != SCTP_ADDR_SRC || + AF_INET != laddr->a.sa.sa_family) + continue; + + fl4->fl4_sport = laddr->a.v4.sin_port; + flowi4_update_output(fl4, + asoc->base.sk->sk_bound_dev_if, + RT_CONN_FLAGS(asoc->base.sk), + daddr->v4.sin_addr.s_addr, + laddr->a.v4.sin_addr.s_addr); + + rt = ip_route_output_key(sock_net(sk), fl4); + if (IS_ERR(rt)) + continue; + + /* Ensure the src address belongs to the output + * interface. + */ + odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, + false); + if (!odev || odev->ifindex != fl4->flowi4_oif) + continue; + + dst = &rt->dst; + break; } out_unlock: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3ee27b7..d7eaa73 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -853,7 +853,7 @@ nomem: /* * Respond to a normal COOKIE ACK chunk. - * We are the side that is being asked for an association. + * We are the side that is asking for an association. * * RFC 2960 5.1 Normal Establishment of an Association * diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 9f2add3..33bafa2 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (switchdev_port_attr_get(dev, &attr)) return NULL; - if (nhsel > 0) { - if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) + if (nhsel > 0 && + !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) return NULL; - if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, - attr.u.ppid.id_len)) - return NULL; - } prev_attr = attr; } @@ -1043,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi) fi->fib_net->ipv4.fib_offload_disabled = true; } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); + +static bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) +{ + struct switchdev_attr a_attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + struct switchdev_attr b_attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + + if (switchdev_port_attr_get(a, &a_attr) || + switchdev_port_attr_get(b, &b_attr)) + return false; + + return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); +} + +static u32 switchdev_port_fwd_mark_get(struct net_device *dev, + struct net_device *group_dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev == dev) + continue; + if (switchdev_port_same_parent_id(dev, lower_dev)) + return lower_dev->offload_fwd_mark; + return switchdev_port_fwd_mark_get(dev, lower_dev); + } + + return dev->ifindex; +} + +static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, + u32 old_mark, u32 *reset_mark) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev->offload_fwd_mark == old_mark) { + if (!*reset_mark) + *reset_mark = lower_dev->ifindex; + lower_dev->offload_fwd_mark = *reset_mark; + } + switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); + } +} + +/** + * switchdev_port_fwd_mark_set - Set port offload forwarding mark + * + * @dev: port device + * @group_dev: containing device + * @joining: true if dev is joining group; false if leaving group + * + * An ungrouped port's offload mark is just its ifindex. A grouped + * port's (member of a bridge, for example) offload mark is the ifindex + * of one of the ports in the group with the same parent (switch) ID. + * Ports on the same device in the same group will have the same mark. + * + * Example: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=5 + * sw2p2 ifindex=5 mark=5 + * + * If sw2p2 leaves the bridge, we'll have: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=4 + * sw2p2 ifindex=5 mark=5 + */ +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ + u32 mark = dev->ifindex; + u32 reset_mark = 0; + + if (group_dev && joining) { + mark = switchdev_port_fwd_mark_get(dev, group_dev); + } else if (group_dev && !joining) { + if (dev->offload_fwd_mark == mark) + /* Ohoh, this port was the mark reference port, + * but it's leaving the group, so reset the + * mark for the remaining ports in the group. + */ + switchdev_port_fwd_mark_reset(group_dev, mark, + &reset_mark); + } + + dev->offload_fwd_mark = mark; +} +EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index a816382..8b010c9 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -316,6 +316,29 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, } } +void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *hdr) +{ + u16 last = msg_last_bcast(hdr); + int mtyp = msg_type(hdr); + + if (unlikely(msg_user(hdr) != LINK_PROTOCOL)) + return; + if (mtyp == STATE_MSG) { + tipc_bclink_update_link_state(n, last); + return; + } + /* Compatibility: older nodes don't know BCAST_PROTOCOL synchronization, + * and transfer synch info in LINK_PROTOCOL messages. + */ + if (tipc_node_is_up(n)) + return; + if ((mtyp != RESET_MSG) && (mtyp != ACTIVATE_MSG)) + return; + n->bclink.last_sent = last; + n->bclink.last_in = last; + n->bclink.oos_state = 0; +} + /** * bclink_peek_nack - monitor retransmission requests sent by other nodes * @@ -358,10 +381,9 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list) /* Prepare clone of message for local node */ skb = tipc_msg_reassemble(list); - if (unlikely(!skb)) { - __skb_queue_purge(list); + if (unlikely(!skb)) return -EHOSTUNREACH; - } + /* Broadcast to all nodes */ if (likely(bclink)) { tipc_bclink_lock(net); @@ -413,7 +435,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) * all nodes in the cluster don't ACK at the same time */ if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) { - tipc_link_proto_xmit(node->active_links[node->addr & 1], + tipc_link_proto_xmit(node_active_link(node, node->addr), STATE_MSG, 0, 0, 0, 0); tn->bcl->stats.sent_acks++; } @@ -925,7 +947,6 @@ int tipc_bclink_init(struct net *net) tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); bcl->bearer_id = MAX_BEARERS; rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer); - bcl->state = WORKING_WORKING; bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg; msg_set_prevnode(bcl->pmsg, tn->own_addr); strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 3c290a48..d74c69b 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -133,5 +133,6 @@ void tipc_bclink_wakeup_users(struct net *net); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); void tipc_bclink_input(struct net *net); +void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *msg); #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 00bc0e6..eae58a6 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -470,6 +470,32 @@ void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf, rcu_read_unlock(); } +/* tipc_bearer_xmit() -send buffer to destination over bearer + */ +void tipc_bearer_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr *dst) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_bearer *b; + struct sk_buff *skb, *tmp; + + if (skb_queue_empty(xmitq)) + return; + + rcu_read_lock(); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (likely(b)) { + skb_queue_walk_safe(xmitq, skb, tmp) { + __skb_dequeue(xmitq); + b->media->send_msg(net, skb, b, dst); + /* Until we remove cloning in tipc_l2_send_msg(): */ + kfree_skb(skb); + } + } + rcu_read_unlock(); +} + /** * tipc_l2_rcv_msg - handle incoming TIPC message from an interface * @buf: the received packet diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index dc714d9..6426f24 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -217,5 +217,8 @@ void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf, struct tipc_media_addr *dest); +void tipc_bearer_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr *dst); #endif /* _TIPC_BEARER_H */ diff --git a/net/tipc/core.h b/net/tipc/core.h index 0fcf133..f4ed677 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -129,6 +129,11 @@ static inline int less(u16 left, u16 right) return less_eq(left, right) && (mod(right) != mod(left)); } +static inline int in_range(u16 val, u16 min, u16 max) +{ + return !less(val, min) && !more(val, max); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 967e292..164d089 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -35,7 +35,7 @@ */ #include "core.h" -#include "link.h" +#include "node.h" #include "discover.h" /* min delay during bearer start up */ @@ -125,7 +125,6 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; - struct tipc_link *link; struct tipc_media_addr maddr; struct sk_buff *rbuf; struct tipc_msg *msg = buf_msg(buf); @@ -170,13 +169,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, return; tipc_node_lock(node); node->capabilities = caps; - link = node->links[bearer->identity]; /* Prepare to validate requesting node's signature and media address */ sign_match = (signature == node->signature); - addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr)); - link_up = link && tipc_link_is_up(link); - + tipc_node_check_dest(node, bearer, &link_up, &addr_match, &maddr); /* These three flags give us eight permutations: */ @@ -239,16 +235,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, if (accept_sign) node->signature = signature; - if (accept_addr) { - if (!link) - link = tipc_link_create(node, bearer, &maddr); - if (link) { - memcpy(&link->media_addr, &maddr, sizeof(maddr)); - tipc_link_reset(link); - } else { - respond = false; - } - } + if (accept_addr && !tipc_node_update_dest(node, bearer, &maddr)) + respond = false; /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { diff --git a/net/tipc/link.c b/net/tipc/link.c index eaa9fe5..b63d573 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -77,36 +77,70 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { }; /* - * Out-of-range value for link session numbers + * Interval between NACKs when packets arrive out of order */ -#define INVALID_SESSION 0x10000 - +#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) /* - * Link state events: + * Out-of-range value for link session numbers */ -#define STARTING_EVT 856384768 /* link processing trigger */ -#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ -#define SILENCE_EVT 560817u /* timer dicovered silence from peer */ +#define WILDCARD_SESSION 0x10000 -/* - * State value stored in 'failover_pkts' +/* State value stored in 'failover_pkts' */ #define FIRST_FAILOVER 0xffffu -static void link_handle_out_of_seq_msg(struct tipc_link *link, - struct sk_buff *skb); -static void tipc_link_proto_rcv(struct tipc_link *link, - struct sk_buff *skb); -static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol); -static void link_state_event(struct tipc_link *l_ptr, u32 event); +/* Link FSM states and events: + */ +enum { + TIPC_LINK_WORKING, + TIPC_LINK_PROBING, + TIPC_LINK_RESETTING, + TIPC_LINK_ESTABLISHING +}; + +enum { + PEER_RESET_EVT = RESET_MSG, + ACTIVATE_EVT = ACTIVATE_MSG, + TRAFFIC_EVT, /* Any other valid msg from peer */ + SILENCE_EVT /* Peer was silent during last timer interval*/ +}; + +/* Link FSM state checking routines + */ +static int link_working(struct tipc_link *l) +{ + return l->state == TIPC_LINK_WORKING; +} + +static int link_probing(struct tipc_link *l) +{ + return l->state == TIPC_LINK_PROBING; +} + +static int link_resetting(struct tipc_link *l) +{ + return l->state == TIPC_LINK_RESETTING; +} + +static int link_establishing(struct tipc_link *l) +{ + return l->state == TIPC_LINK_ESTABLISHING; +} + +static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); +static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, + u16 rcvgap, int tolerance, int priority, + struct sk_buff_head *xmitq); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static void tipc_link_sync_xmit(struct tipc_link *l); +static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); -static void link_set_timer(struct tipc_link *link, unsigned long time); + /* * Simple link routines */ @@ -115,26 +149,13 @@ static unsigned int align(unsigned int i) return (i + 3) & ~3u; } -static void tipc_link_release(struct kref *kref) -{ - kfree(container_of(kref, struct tipc_link, ref)); -} - -static void tipc_link_get(struct tipc_link *l_ptr) -{ - kref_get(&l_ptr->ref); -} - -static void tipc_link_put(struct tipc_link *l_ptr) -{ - kref_put(&l_ptr->ref, tipc_link_release); -} - static struct tipc_link *tipc_parallel_link(struct tipc_link *l) { - if (l->owner->active_links[0] != l) - return l->owner->active_links[0]; - return l->owner->active_links[1]; + struct tipc_node *n = l->owner; + + if (node_active_link(n, 0) != l) + return node_active_link(n, 0); + return node_active_link(n, 1); } /* @@ -144,74 +165,14 @@ int tipc_link_is_up(struct tipc_link *l_ptr) { if (!l_ptr) return 0; - return link_working_working(l_ptr) || link_working_unknown(l_ptr); + return link_working(l_ptr) || link_probing(l_ptr); } -int tipc_link_is_active(struct tipc_link *l_ptr) +int tipc_link_is_active(struct tipc_link *l) { - return (l_ptr->owner->active_links[0] == l_ptr) || - (l_ptr->owner->active_links[1] == l_ptr); -} - -/** - * link_timeout - handle expiration of link timer - * @l_ptr: pointer to link - */ -static void link_timeout(unsigned long data) -{ - struct tipc_link *l_ptr = (struct tipc_link *)data; - struct sk_buff *skb; - - tipc_node_lock(l_ptr->owner); - - /* update counters used in statistical profiling of send traffic */ - l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq); - l_ptr->stats.queue_sz_counts++; - - skb = skb_peek(&l_ptr->transmq); - if (skb) { - struct tipc_msg *msg = buf_msg(skb); - u32 length = msg_size(msg); - - if ((msg_user(msg) == MSG_FRAGMENTER) && - (msg_type(msg) == FIRST_FRAGMENT)) { - length = msg_size(msg_get_wrapped(msg)); - } - if (length) { - l_ptr->stats.msg_lengths_total += length; - l_ptr->stats.msg_length_counts++; - if (length <= 64) - l_ptr->stats.msg_length_profile[0]++; - else if (length <= 256) - l_ptr->stats.msg_length_profile[1]++; - else if (length <= 1024) - l_ptr->stats.msg_length_profile[2]++; - else if (length <= 4096) - l_ptr->stats.msg_length_profile[3]++; - else if (length <= 16384) - l_ptr->stats.msg_length_profile[4]++; - else if (length <= 32768) - l_ptr->stats.msg_length_profile[5]++; - else - l_ptr->stats.msg_length_profile[6]++; - } - } - - /* do all other link processing performed on a periodic basis */ - if (l_ptr->silent_intv_cnt || tipc_bclink_acks_missing(l_ptr->owner)) - link_state_event(l_ptr, SILENCE_EVT); - l_ptr->silent_intv_cnt++; - if (skb_queue_len(&l_ptr->backlogq)) - tipc_link_push_packets(l_ptr); - link_set_timer(l_ptr, l_ptr->keepalive_intv); - tipc_node_unlock(l_ptr->owner); - tipc_link_put(l_ptr); -} + struct tipc_node *n = l->owner; -static void link_set_timer(struct tipc_link *link, unsigned long time) -{ - if (!mod_timer(&link->timer, jiffies + time)) - tipc_link_get(link); + return (node_active_link(n, 0) == l) || (node_active_link(n, 1) == l); } /** @@ -224,7 +185,9 @@ static void link_set_timer(struct tipc_link *link, unsigned long time) */ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr) + const struct tipc_media_addr *media_addr, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq) { struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); struct tipc_link *l_ptr; @@ -240,7 +203,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, return NULL; } - if (n_ptr->links[b_ptr->identity]) { + if (n_ptr->links[b_ptr->identity].link) { tipc_addr_string_fill(addr_string, n_ptr->addr); pr_err("Attempt to establish second link on <%s> to %s\n", b_ptr->name, addr_string); @@ -252,7 +215,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, pr_warn("Link creation failed, no memory\n"); return NULL; } - kref_init(&l_ptr->ref); l_ptr->addr = peer; if_name = strchr(b_ptr->name, ':') + 1; sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown", @@ -263,10 +225,10 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, /* note: peer i/f name is updated by reset/activate message */ memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); l_ptr->owner = n_ptr; - l_ptr->peer_session = INVALID_SESSION; + l_ptr->peer_session = WILDCARD_SESSION; l_ptr->bearer_id = b_ptr->identity; - link_set_supervision_props(l_ptr, b_ptr->tolerance); - l_ptr->state = RESET_UNKNOWN; + l_ptr->tolerance = b_ptr->tolerance; + l_ptr->state = TIPC_LINK_RESETTING; l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; msg = l_ptr->pmsg; @@ -286,13 +248,11 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, __skb_queue_head_init(&l_ptr->backlogq); __skb_queue_head_init(&l_ptr->deferdq); skb_queue_head_init(&l_ptr->wakeupq); - skb_queue_head_init(&l_ptr->inputq); - skb_queue_head_init(&l_ptr->namedq); + l_ptr->inputq = inputq; + l_ptr->namedq = namedq; + skb_queue_head_init(l_ptr->inputq); link_reset_statistics(l_ptr); tipc_node_attach_link(n_ptr, l_ptr); - setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr); - link_state_event(l_ptr, STARTING_EVT); - return l_ptr; } @@ -303,13 +263,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, void tipc_link_delete(struct tipc_link *l) { tipc_link_reset(l); - if (del_timer(&l->timer)) - tipc_link_put(l); - l->flags |= LINK_STOPPED; - /* Delete link now, or when timer is finished: */ tipc_link_reset_fragments(l); tipc_node_detach_link(l->owner, l); - tipc_link_put(l); } void tipc_link_delete_list(struct net *net, unsigned int bearer_id) @@ -321,7 +276,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id) rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (link) tipc_link_delete(link); tipc_node_unlock(node); @@ -329,12 +284,219 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id) rcu_read_unlock(); } +/* tipc_link_build_bcast_sync_msg() - synchronize broadcast link endpoints. + * + * Give a newly added peer node the sequence number where it should + * start receiving and acking broadcast packets. + */ +static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb; + struct sk_buff_head list; + + skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, + 0, l->addr, link_own_addr(l), 0, 0, 0); + if (!skb) + return; + __skb_queue_head_init(&list); + __skb_queue_tail(&list, skb); + tipc_link_xmit(l, &list, xmitq); +} + +/** + * tipc_link_fsm_evt - link finite state machine + * @l: pointer to link + * @evt: state machine event to be processed + * @xmitq: queue to prepend created protocol message, if any + */ +static int tipc_link_fsm_evt(struct tipc_link *l, int evt, + struct sk_buff_head *xmitq) +{ + int mtyp = 0, rc = 0; + struct tipc_link *pl; + enum { + LINK_RESET = 1, + LINK_ACTIVATE = (1 << 1), + SND_PROBE = (1 << 2), + SND_STATE = (1 << 3), + SND_RESET = (1 << 4), + SND_ACTIVATE = (1 << 5), + SND_BCAST_SYNC = (1 << 6) + } actions = 0; + + if (l->exec_mode == TIPC_LINK_BLOCKED) + return rc; + + switch (l->state) { + case TIPC_LINK_WORKING: + switch (evt) { + case TRAFFIC_EVT: + case ACTIVATE_EVT: + break; + case SILENCE_EVT: + l->state = TIPC_LINK_PROBING; + actions |= SND_PROBE; + break; + case PEER_RESET_EVT: + actions |= LINK_RESET | SND_ACTIVATE; + break; + default: + pr_debug("%s%u WORKING\n", link_unk_evt, evt); + } + break; + case TIPC_LINK_PROBING: + switch (evt) { + case TRAFFIC_EVT: + case ACTIVATE_EVT: + l->state = TIPC_LINK_WORKING; + break; + case PEER_RESET_EVT: + actions |= LINK_RESET | SND_ACTIVATE; + break; + case SILENCE_EVT: + if (l->silent_intv_cnt <= l->abort_limit) { + actions |= SND_PROBE; + break; + } + actions |= LINK_RESET | SND_RESET; + break; + default: + pr_err("%s%u PROBING\n", link_unk_evt, evt); + } + break; + case TIPC_LINK_RESETTING: + switch (evt) { + case TRAFFIC_EVT: + break; + case ACTIVATE_EVT: + pl = node_active_link(l->owner, 0); + if (pl && link_probing(pl)) + break; + actions |= LINK_ACTIVATE; + if (!l->owner->working_links) + actions |= SND_BCAST_SYNC; + break; + case PEER_RESET_EVT: + l->state = TIPC_LINK_ESTABLISHING; + actions |= SND_ACTIVATE; + break; + case SILENCE_EVT: + actions |= SND_RESET; + break; + default: + pr_err("%s%u in RESETTING\n", link_unk_evt, evt); + } + break; + case TIPC_LINK_ESTABLISHING: + switch (evt) { + case TRAFFIC_EVT: + case ACTIVATE_EVT: + pl = node_active_link(l->owner, 0); + if (pl && link_probing(pl)) + break; + actions |= LINK_ACTIVATE; + if (!l->owner->working_links) + actions |= SND_BCAST_SYNC; + break; + case PEER_RESET_EVT: + break; + case SILENCE_EVT: + actions |= SND_ACTIVATE; + break; + default: + pr_err("%s%u ESTABLISHING\n", link_unk_evt, evt); + } + break; + default: + pr_err("Unknown link state %u/%u\n", l->state, evt); + } + + /* Perform actions as decided by FSM */ + if (actions & LINK_RESET) { + l->exec_mode = TIPC_LINK_BLOCKED; + rc |= TIPC_LINK_DOWN_EVT; + } + if (actions & LINK_ACTIVATE) { + l->exec_mode = TIPC_LINK_OPEN; + rc |= TIPC_LINK_UP_EVT; + } + if (actions & (SND_STATE | SND_PROBE)) + mtyp = STATE_MSG; + if (actions & SND_RESET) + mtyp = RESET_MSG; + if (actions & SND_ACTIVATE) + mtyp = ACTIVATE_MSG; + if (actions & (SND_PROBE | SND_STATE | SND_RESET | SND_ACTIVATE)) + tipc_link_build_proto_msg(l, mtyp, actions & SND_PROBE, + 0, 0, 0, xmitq); + if (actions & SND_BCAST_SYNC) + tipc_link_build_bcast_sync_msg(l, xmitq); + return rc; +} + +/* link_profile_stats - update statistical profiling of traffic + */ +static void link_profile_stats(struct tipc_link *l) +{ + struct sk_buff *skb; + struct tipc_msg *msg; + int length; + + /* Update counters used in statistical profiling of send traffic */ + l->stats.accu_queue_sz += skb_queue_len(&l->transmq); + l->stats.queue_sz_counts++; + + skb = skb_peek(&l->transmq); + if (!skb) + return; + msg = buf_msg(skb); + length = msg_size(msg); + + if (msg_user(msg) == MSG_FRAGMENTER) { + if (msg_type(msg) != FIRST_FRAGMENT) + return; + length = msg_size(msg_get_wrapped(msg)); + } + l->stats.msg_lengths_total += length; + l->stats.msg_length_counts++; + if (length <= 64) + l->stats.msg_length_profile[0]++; + else if (length <= 256) + l->stats.msg_length_profile[1]++; + else if (length <= 1024) + l->stats.msg_length_profile[2]++; + else if (length <= 4096) + l->stats.msg_length_profile[3]++; + else if (length <= 16384) + l->stats.msg_length_profile[4]++; + else if (length <= 32768) + l->stats.msg_length_profile[5]++; + else + l->stats.msg_length_profile[6]++; +} + +/* tipc_link_timeout - perform periodic task as instructed from node timeout + */ +int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + int rc = 0; + + link_profile_stats(l); + if (l->silent_intv_cnt) + rc = tipc_link_fsm_evt(l, SILENCE_EVT, xmitq); + else if (link_working(l) && tipc_bclink_acks_missing(l->owner)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + l->silent_intv_cnt++; + return rc; +} + /** * link_schedule_user - schedule a message sender for wakeup after congestion * @link: congested link * @list: message that was attempted sent * Create pseudo msg to send back to user when congestion abates - * Only consumes message if there is an error + * Does not consume buffer list */ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) { @@ -347,8 +509,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) /* This really cannot happen... */ if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - tipc_link_reset(link); - goto err; + return -ENOBUFS; } /* Non-blocking sender: */ if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) @@ -358,15 +519,12 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, addr, addr, oport, 0, 0); if (!skb) - goto err; + return -ENOBUFS; TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); TIPC_SKB_CB(skb)->chain_imp = imp; skb_queue_tail(&link->wakeupq, skb); link->stats.link_congs++; return -ELINKCONG; -err: - __skb_queue_purge(list); - return -ENOBUFS; } /** @@ -388,8 +546,8 @@ void link_prepare_wakeup(struct tipc_link *l) if ((pnd[imp] + l->backlog[imp].len) >= lim) break; skb_unlink(skb, &l->wakeupq); - skb_queue_tail(&l->inputq, skb); - l->owner->inputq = &l->inputq; + skb_queue_tail(l->inputq, skb); + l->owner->inputq = l->inputq; l->owner->action_flags |= TIPC_MSG_EVT; } } @@ -436,21 +594,22 @@ void tipc_link_reset(struct tipc_link *l_ptr) msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); /* Link is down, accept any session */ - l_ptr->peer_session = INVALID_SESSION; + l_ptr->peer_session = WILDCARD_SESSION; /* Prepare for renewed mtu size negotiation */ l_ptr->mtu = l_ptr->advertised_mtu; - l_ptr->state = RESET_UNKNOWN; + l_ptr->state = TIPC_LINK_RESETTING; - if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET)) + if ((prev_state == TIPC_LINK_RESETTING) || + (prev_state == TIPC_LINK_ESTABLISHING)) return; - tipc_node_link_down(l_ptr->owner, l_ptr); + tipc_node_link_down(l_ptr->owner, l_ptr->bearer_id); tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr); if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { - l_ptr->flags |= LINK_FAILINGOVER; + l_ptr->exec_mode = TIPC_LINK_BLOCKED; l_ptr->failover_checkpt = l_ptr->rcv_nxt; pl->failover_pkts = FIRST_FAILOVER; pl->failover_checkpt = l_ptr->rcv_nxt; @@ -462,7 +621,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) __skb_queue_purge(&l_ptr->transmq); __skb_queue_purge(&l_ptr->deferdq); if (!owner->inputq) - owner->inputq = &l_ptr->inputq; + owner->inputq = l_ptr->inputq; skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; @@ -470,173 +629,32 @@ void tipc_link_reset(struct tipc_link *l_ptr) l_ptr->reasm_buf = NULL; l_ptr->rcv_unacked = 0; l_ptr->snd_nxt = 1; + l_ptr->rcv_nxt = 1; l_ptr->silent_intv_cnt = 0; + l_ptr->stats.recv_info = 0; l_ptr->stale_count = 0; link_reset_statistics(l_ptr); } -static void link_activate(struct tipc_link *link) +void tipc_link_activate(struct tipc_link *link) { struct tipc_node *node = link->owner; link->rcv_nxt = 1; link->stats.recv_info = 1; link->silent_intv_cnt = 0; - tipc_node_link_up(node, link); + link->state = TIPC_LINK_WORKING; + link->exec_mode = TIPC_LINK_OPEN; + tipc_node_link_up(node, link->bearer_id); tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); } /** - * link_state_event - link finite state machine - * @l_ptr: pointer to link - * @event: state machine event to process - */ -static void link_state_event(struct tipc_link *l_ptr, unsigned int event) -{ - struct tipc_link *other; - unsigned long timer_intv = l_ptr->keepalive_intv; - - if (l_ptr->flags & LINK_STOPPED) - return; - - if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) - return; /* Not yet. */ - - if (l_ptr->flags & LINK_FAILINGOVER) - return; - - switch (l_ptr->state) { - case WORKING_WORKING: - switch (event) { - case TRAFFIC_MSG_EVT: - case ACTIVATE_MSG: - l_ptr->silent_intv_cnt = 0; - break; - case SILENCE_EVT: - if (!l_ptr->silent_intv_cnt) { - if (tipc_bclink_acks_missing(l_ptr->owner)) - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0); - break; - } - l_ptr->state = WORKING_UNKNOWN; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - break; - case RESET_MSG: - pr_debug("%s<%s>, requested by peer\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_RESET; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - break; - default: - pr_debug("%s%u in WW state\n", link_unk_evt, event); - } - break; - case WORKING_UNKNOWN: - switch (event) { - case TRAFFIC_MSG_EVT: - case ACTIVATE_MSG: - l_ptr->state = WORKING_WORKING; - l_ptr->silent_intv_cnt = 0; - break; - case RESET_MSG: - pr_debug("%s<%s>, requested by peer while probing\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_RESET; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - break; - case SILENCE_EVT: - if (!l_ptr->silent_intv_cnt) { - l_ptr->state = WORKING_WORKING; - if (tipc_bclink_acks_missing(l_ptr->owner)) - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0); - } else if (l_ptr->silent_intv_cnt < - l_ptr->abort_limit) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0); - } else { /* Link has failed */ - pr_debug("%s<%s>, peer not responding\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_UNKNOWN; - tipc_link_proto_xmit(l_ptr, RESET_MSG, - 0, 0, 0, 0); - } - break; - default: - pr_err("%s%u in WU state\n", link_unk_evt, event); - } - break; - case RESET_UNKNOWN: - switch (event) { - case TRAFFIC_MSG_EVT: - break; - case ACTIVATE_MSG: - other = l_ptr->owner->active_links[0]; - if (other && link_working_unknown(other)) - break; - l_ptr->state = WORKING_WORKING; - link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - if (l_ptr->owner->working_links == 1) - tipc_link_sync_xmit(l_ptr); - break; - case RESET_MSG: - l_ptr->state = RESET_RESET; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 1, 0, 0, 0); - break; - case STARTING_EVT: - l_ptr->flags |= LINK_STARTED; - link_set_timer(l_ptr, timer_intv); - break; - case SILENCE_EVT: - tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - break; - default: - pr_err("%s%u in RU state\n", link_unk_evt, event); - } - break; - case RESET_RESET: - switch (event) { - case TRAFFIC_MSG_EVT: - case ACTIVATE_MSG: - other = l_ptr->owner->active_links[0]; - if (other && link_working_unknown(other)) - break; - l_ptr->state = WORKING_WORKING; - link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - if (l_ptr->owner->working_links == 1) - tipc_link_sync_xmit(l_ptr); - break; - case RESET_MSG: - break; - case SILENCE_EVT: - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - break; - default: - pr_err("%s%u in RR state\n", link_unk_evt, event); - } - break; - default: - pr_err("Unknown link state %u/%u\n", l_ptr->state, event); - } -} - -/** * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked * @link: link to use * @list: chain of buffers containing message * - * Consumes the buffer chain, except when returning -ELINKCONG, - * since the caller then may want to make more send attempts. + * Consumes the buffer chain, except when returning an error code, * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ @@ -660,10 +678,9 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, if (unlikely(link->backlog[i].len >= link->backlog[i].limit)) return link_schedule_user(link, list); } - if (unlikely(msg_size(msg) > mtu)) { - __skb_queue_purge(list); + if (unlikely(msg_size(msg) > mtu)) return -EMSGSIZE; - } + /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { skb = skb_peek(list); @@ -700,101 +717,90 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, return 0; } -static void skb2list(struct sk_buff *skb, struct sk_buff_head *list) -{ - skb_queue_head_init(list); - __skb_queue_tail(list, skb); -} - -static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) -{ - struct sk_buff_head head; - - skb2list(skb, &head); - return __tipc_link_xmit(link->owner->net, link, &head); -} - -/* tipc_link_xmit_skb(): send single buffer to destination - * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE - * messages, which will not be rejected - * The only exception is datagram messages rerouted after secondary - * lookup, which are rare and safe to dispose of anyway. - * TODO: Return real return value, and let callers use - * tipc_wait_for_sendpkt() where applicable - */ -int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, - u32 selector) -{ - struct sk_buff_head head; - int rc; - - skb2list(skb, &head); - rc = tipc_link_xmit(net, &head, dnode, selector); - if (rc == -ELINKCONG) - kfree_skb(skb); - return 0; -} - /** - * tipc_link_xmit() is the general link level function for message sending - * @net: the applicable net namespace + * tipc_link_xmit(): enqueue buffer list according to queue situation + * @link: link to use * @list: chain of buffers containing message - * @dsz: amount of user data to be sent - * @dnode: address of destination node - * @selector: a number used for deterministic link selection - * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + * @xmitq: returned list of packets to be sent by caller + * + * Consumes the buffer chain, except when returning -ELINKCONG, + * since the caller then may want to make more send attempts. + * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS + * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ -int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, - u32 selector) +int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, + struct sk_buff_head *xmitq) { - struct tipc_link *link = NULL; - struct tipc_node *node; - int rc = -EHOSTUNREACH; + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + unsigned int maxwin = l->window; + unsigned int i, imp = msg_importance(hdr); + unsigned int mtu = l->mtu; + u16 ack = l->rcv_nxt - 1; + u16 seqno = l->snd_nxt; + u16 bc_last_in = l->owner->bclink.last_in; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff *skb, *_skb, *bskb; - node = tipc_node_find(net, dnode); - if (node) { - tipc_node_lock(node); - link = node->active_links[selector & 1]; - if (link) - rc = __tipc_link_xmit(net, link, list); - tipc_node_unlock(node); - tipc_node_put(node); + /* Match msg importance against this and all higher backlog limits: */ + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) + return link_schedule_user(l, list); } - if (link) - return rc; + if (unlikely(msg_size(hdr) > mtu)) + return -EMSGSIZE; - if (likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } + /* Prepare each packet for sending, and add to relevant queue: */ + while (skb_queue_len(list)) { + skb = skb_peek(list); + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_last_in); - __skb_queue_purge(list); - return rc; + if (likely(skb_queue_len(transmq) < maxwin)) { + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) + return -ENOBUFS; + __skb_dequeue(list); + __skb_queue_tail(transmq, skb); + __skb_queue_tail(xmitq, _skb); + l->rcv_unacked = 0; + seqno++; + continue; + } + if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) { + kfree_skb(__skb_dequeue(list)); + l->stats.sent_bundled++; + continue; + } + if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) { + kfree_skb(__skb_dequeue(list)); + __skb_queue_tail(backlogq, bskb); + l->backlog[msg_importance(buf_msg(bskb))].len++; + l->stats.sent_bundled++; + l->stats.sent_bundles++; + continue; + } + l->backlog[imp].len += skb_queue_len(list); + skb_queue_splice_tail_init(list, backlogq); + } + l->snd_nxt = seqno; + return 0; } -/* - * tipc_link_sync_xmit - synchronize broadcast link endpoints. - * - * Give a newly added peer node the sequence number where it should - * start receiving and acking broadcast packets. - * - * Called with node locked - */ -static void tipc_link_sync_xmit(struct tipc_link *link) +static void skb2list(struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff *skb; - struct tipc_msg *msg; + skb_queue_head_init(list); + __skb_queue_tail(list, skb); +} - skb = tipc_buf_acquire(INT_H_SIZE); - if (!skb) - return; +static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) +{ + struct sk_buff_head head; - msg = buf_msg(skb); - tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG, - INT_H_SIZE, link->addr); - msg_set_last_bcast(msg, link->owner->bclink.acked); - __tipc_link_xmit_skb(link, skb); + skb2list(skb, &head); + return __tipc_link_xmit(link->owner->net, link, &head); } /* @@ -847,6 +853,34 @@ void tipc_link_push_packets(struct tipc_link *link) link->snd_nxt = seqno; } +void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + struct sk_buff *skb, *_skb; + struct tipc_msg *hdr; + u16 seqno = l->snd_nxt; + u16 ack = l->rcv_nxt - 1; + + while (skb_queue_len(&l->transmq) < l->window) { + skb = skb_peek(&l->backlogq); + if (!skb) + break; + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) + break; + __skb_dequeue(&l->backlogq); + hdr = buf_msg(skb); + l->backlog[msg_importance(hdr)].len--; + __skb_queue_tail(&l->transmq, skb); + __skb_queue_tail(xmitq, _skb); + msg_set_ack(hdr, ack); + msg_set_seqno(hdr, seqno); + msg_set_bcast_ack(hdr, l->owner->bclink.last_in); + l->rcv_unacked = 0; + seqno++; + } + l->snd_nxt = seqno; +} + void tipc_link_reset_all(struct tipc_node *node) { char addr_string[16]; @@ -858,9 +892,9 @@ void tipc_link_reset_all(struct tipc_node *node) tipc_addr_string_fill(addr_string, node->addr)); for (i = 0; i < MAX_BEARERS; i++) { - if (node->links[i]) { - link_print(node->links[i], "Resetting link\n"); - tipc_link_reset(node->links[i]); + if (node->links[i].link) { + link_print(node->links[i].link, "Resetting link\n"); + tipc_link_reset(node->links[i].link); } } @@ -877,9 +911,13 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, if (l_ptr->addr) { /* Handle failure on standard link */ - link_print(l_ptr, "Resetting link\n"); + link_print(l_ptr, "Resetting link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(msg), msg_type(msg), msg_size(msg), + msg_errcode(msg)); + pr_info("sqno %u, prev: %x, src: %x\n", + msg_seqno(msg), msg_prevnode(msg), msg_orignode(msg)); tipc_link_reset(l_ptr); - } else { /* Handle failure on broadcast link */ struct tipc_node *n_ptr; @@ -940,6 +978,41 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, } } +static int tipc_link_retransm(struct tipc_link *l, int retransm, + struct sk_buff_head *xmitq) +{ + struct sk_buff *_skb, *skb = skb_peek(&l->transmq); + struct tipc_msg *hdr; + + if (!skb) + return 0; + + /* Detect repeated retransmit failures on same packet */ + if (likely(l->last_retransm != buf_seqno(skb))) { + l->last_retransm = buf_seqno(skb); + l->stale_count = 1; + } else if (++l->stale_count > 100) { + link_retransmit_failure(l, skb); + return TIPC_LINK_DOWN_EVT; + } + skb_queue_walk(&l->transmq, skb) { + if (!retransm) + return 0; + hdr = buf_msg(skb); + _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + if (!_skb) + return 0; + hdr = buf_msg(_skb); + msg_set_ack(hdr, l->rcv_nxt - 1); + msg_set_bcast_ack(hdr, l->owner->bclink.last_in); + _skb->priority = TC_PRIO_CONTROL; + __skb_queue_tail(xmitq, _skb); + retransm--; + l->stats.retransmitted++; + } + return 0; +} + /* link_synch(): check if all packets arrived before the synch * point have been consumed * Returns true if the parallel links are synched, otherwise false @@ -959,168 +1032,13 @@ static bool link_synch(struct tipc_link *l) /* Is it still in the input queue ? */ post_synch = mod(pl->rcv_nxt - l->synch_point) - 1; - if (skb_queue_len(&pl->inputq) > post_synch) + if (skb_queue_len(pl->inputq) > post_synch) return false; synched: - l->flags &= ~LINK_SYNCHING; + l->exec_mode = TIPC_LINK_OPEN; return true; } -static void link_retrieve_defq(struct tipc_link *link, - struct sk_buff_head *list) -{ - u16 seq_no; - - if (skb_queue_empty(&link->deferdq)) - return; - - seq_no = buf_seqno(skb_peek(&link->deferdq)); - if (seq_no == link->rcv_nxt) - skb_queue_splice_tail_init(&link->deferdq, list); -} - -/** - * tipc_rcv - process TIPC packets/messages arriving from off-node - * @net: the applicable net namespace - * @skb: TIPC packet - * @b_ptr: pointer to bearer message arrived on - * - * Invoked with no locks held. Bearer pointer must point to a valid bearer - * structure (i.e. cannot be NULL), but bearer can be inactive. - */ -void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff_head head; - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; - struct sk_buff *skb1, *tmp; - struct tipc_msg *msg; - u16 seq_no; - u16 ackd; - u32 released; - - skb2list(skb, &head); - - while ((skb = __skb_dequeue(&head))) { - /* Ensure message is well-formed */ - if (unlikely(!tipc_msg_validate(skb))) - goto discard; - - /* Handle arrival of a non-unicast link message */ - msg = buf_msg(skb); - if (unlikely(msg_non_seq(msg))) { - if (msg_user(msg) == LINK_CONFIG) - tipc_disc_rcv(net, skb, b_ptr); - else - tipc_bclink_rcv(net, skb); - continue; - } - - /* Discard unicast link messages destined for another node */ - if (unlikely(!msg_short(msg) && - (msg_destnode(msg) != tn->own_addr))) - goto discard; - - /* Locate neighboring node that sent message */ - n_ptr = tipc_node_find(net, msg_prevnode(msg)); - if (unlikely(!n_ptr)) - goto discard; - - tipc_node_lock(n_ptr); - /* Locate unicast link endpoint that should handle message */ - l_ptr = n_ptr->links[b_ptr->identity]; - if (unlikely(!l_ptr)) - goto unlock; - - /* Verify that communication with node is currently allowed */ - if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) && - msg_user(msg) == LINK_PROTOCOL && - (msg_type(msg) == RESET_MSG || - msg_type(msg) == ACTIVATE_MSG) && - !msg_redundant_link(msg)) - n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN; - - if (tipc_node_blocked(n_ptr)) - goto unlock; - - /* Validate message sequence number info */ - seq_no = msg_seqno(msg); - ackd = msg_ack(msg); - - /* Release acked messages */ - if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg))) - tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); - - released = 0; - skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) { - if (more(buf_seqno(skb1), ackd)) - break; - __skb_unlink(skb1, &l_ptr->transmq); - kfree_skb(skb1); - released = 1; - } - - /* Try sending any messages link endpoint has pending */ - if (unlikely(skb_queue_len(&l_ptr->backlogq))) - tipc_link_push_packets(l_ptr); - - if (released && !skb_queue_empty(&l_ptr->wakeupq)) - link_prepare_wakeup(l_ptr); - - /* Process the incoming packet */ - if (unlikely(!link_working_working(l_ptr))) { - if (msg_user(msg) == LINK_PROTOCOL) { - tipc_link_proto_rcv(l_ptr, skb); - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; - } - - /* Traffic message. Conditionally activate link */ - link_state_event(l_ptr, TRAFFIC_MSG_EVT); - - if (link_working_working(l_ptr)) { - /* Re-insert buffer in front of queue */ - __skb_queue_head(&head, skb); - skb = NULL; - goto unlock; - } - goto unlock; - } - - /* Link is now in state WORKING_WORKING */ - if (unlikely(seq_no != l_ptr->rcv_nxt)) { - link_handle_out_of_seq_msg(l_ptr, skb); - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; - } - l_ptr->silent_intv_cnt = 0; - - /* Synchronize with parallel link if applicable */ - if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { - if (!link_synch(l_ptr)) - goto unlock; - } - l_ptr->rcv_nxt++; - if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) - link_retrieve_defq(l_ptr, &head); - if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { - l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - } - tipc_link_input(l_ptr, skb); - skb = NULL; -unlock: - tipc_node_unlock(n_ptr); - tipc_node_put(n_ptr); -discard: - if (unlikely(skb)) - kfree_skb(skb); - } -} - /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type @@ -1138,16 +1056,16 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: case CONN_MANAGER: - if (tipc_skb_queue_tail(&link->inputq, skb, dport)) { - node->inputq = &link->inputq; + if (tipc_skb_queue_tail(link->inputq, skb, dport)) { + node->inputq = link->inputq; node->action_flags |= TIPC_MSG_EVT; } return true; case NAME_DISTRIBUTOR: node->bclink.recv_permitted = true; - node->namedq = &link->namedq; - skb_queue_tail(&link->namedq, skb); - if (skb_queue_len(&link->namedq) == 1) + node->namedq = link->namedq; + skb_queue_tail(link->namedq, skb); + if (skb_queue_len(link->namedq) == 1) node->action_flags |= TIPC_NAMED_MSG_EVT; return true; case MSG_BUNDLER: @@ -1174,13 +1092,10 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) struct sk_buff *iskb; int pos = 0; - if (likely(tipc_data_input(link, skb))) - return; - switch (msg_user(msg)) { case TUNNEL_PROTOCOL: if (msg_dup(msg)) { - link->flags |= LINK_SYNCHING; + link->exec_mode = TIPC_LINK_TUNNEL; link->synch_point = msg_seqno(msg_get_wrapped(msg)); kfree_skb(skb); break; @@ -1215,6 +1130,110 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) }; } +static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) +{ + bool released = false; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&l->transmq, skb, tmp) { + if (more(buf_seqno(skb), acked)) + break; + __skb_unlink(skb, &l->transmq); + kfree_skb(skb); + released = true; + } + return released; +} + +/* tipc_link_rcv - process TIPC packets/messages arriving from off-node + * @link: the link that should handle the message + * @skb: TIPC packet + * @xmitq: queue to place packets to be sent after this call + */ +int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + struct sk_buff_head *arrvq = &l->deferdq; + struct sk_buff *tmp; + struct tipc_msg *hdr; + u16 seqno, rcv_nxt; + int rc = 0; + + if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) { + if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, + 0, 0, 0, xmitq); + return rc; + } + + skb_queue_walk_safe(arrvq, skb, tmp) { + hdr = buf_msg(skb); + + /* Verify and update link state */ + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) { + __skb_dequeue(arrvq); + rc |= tipc_link_proto_rcv(l, skb, xmitq); + continue; + } + + if (unlikely(!link_working(l))) { + rc |= tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); + if (!link_working(l)) { + kfree_skb(__skb_dequeue(arrvq)); + return rc; + } + } + + l->silent_intv_cnt = 0; + + /* Forward queues and wake up waiting users */ + if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } + + /* Defer reception if there is a gap in the sequence */ + seqno = msg_seqno(hdr); + rcv_nxt = l->rcv_nxt; + if (unlikely(less(rcv_nxt, seqno))) { + l->stats.deferred_recv++; + return rc; + } + + __skb_dequeue(arrvq); + + /* Drop if packet already received */ + if (unlikely(more(rcv_nxt, seqno))) { + l->stats.duplicates++; + kfree_skb(skb); + return rc; + } + + /* Synchronize with parallel link if applicable */ + if (unlikely(l->exec_mode == TIPC_LINK_TUNNEL)) + if (!msg_dup(hdr) && !link_synch(l)) { + kfree_skb(skb); + return rc; + } + + /* Packet can be delivered */ + l->rcv_nxt++; + l->stats.recv_info++; + if (unlikely(!tipc_data_input(l, skb))) + tipc_link_input(l, skb); + + /* Ack at regular intervals */ + if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { + l->rcv_unacked = 0; + l->stats.sent_acks++; + tipc_link_build_proto_msg(l, STATE_MSG, + 0, 0, 0, 0, xmitq); + } + } + return rc; +} + /** * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue * @@ -1255,235 +1274,85 @@ u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) } /* - * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet + * Send protocol message to the other endpoint. */ -static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, - struct sk_buff *buf) +void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg, + u32 gap, u32 tolerance, u32 priority) { - u32 seq_no = buf_seqno(buf); - - if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) { - tipc_link_proto_rcv(l_ptr, buf); - return; - } - - /* Record OOS packet arrival */ - l_ptr->silent_intv_cnt = 0; + struct sk_buff *skb = NULL; + struct sk_buff_head xmitq; - /* - * Discard packet if a duplicate; otherwise add it to deferred queue - * and notify peer of gap as per protocol specification - */ - if (less(seq_no, l_ptr->rcv_nxt)) { - l_ptr->stats.duplicates++; - kfree_skb(buf); + __skb_queue_head_init(&xmitq); + tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap, + tolerance, priority, &xmitq); + skb = __skb_dequeue(&xmitq); + if (!skb) return; - } - - if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) { - l_ptr->stats.deferred_recv++; - if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - } else { - l_ptr->stats.duplicates++; - } + tipc_bearer_send(l->owner->net, l->bearer_id, skb, &l->media_addr); + l->rcv_unacked = 0; + kfree_skb(skb); } -/* - * Send protocol message to the other endpoint. +/* tipc_link_build_proto_msg: prepare link protocol message for transmission */ -void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, - u32 gap, u32 tolerance, u32 priority) +static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, + u16 rcvgap, int tolerance, int priority, + struct sk_buff_head *xmitq) { - struct sk_buff *buf = NULL; - struct tipc_msg *msg = l_ptr->pmsg; - u32 msg_size = sizeof(l_ptr->proto_msg); - int r_flag; - u16 last_rcv; - - /* Don't send protocol message during link failover */ - if (l_ptr->flags & LINK_FAILINGOVER) - return; - - /* Abort non-RESET send if communication with node is prohibited */ - if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG)) + struct sk_buff *skb = NULL; + struct tipc_msg *hdr = l->pmsg; + u16 snd_nxt = l->snd_nxt; + u16 rcv_nxt = l->rcv_nxt; + u16 rcv_last = rcv_nxt - 1; + int node_up = l->owner->bclink.recv_permitted; + + /* Don't send protocol message during reset or link failover */ + if (l->exec_mode == TIPC_LINK_BLOCKED) return; - /* Create protocol message with "out-of-sequence" sequence number */ - msg_set_type(msg, msg_typ); - msg_set_net_plane(msg, l_ptr->net_plane); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net)); + msg_set_type(hdr, mtyp); + msg_set_net_plane(hdr, l->net_plane); + msg_set_bcast_ack(hdr, l->owner->bclink.last_in); + msg_set_last_bcast(hdr, tipc_bclink_get_last_sent(l->owner->net)); + msg_set_link_tolerance(hdr, tolerance); + msg_set_linkprio(hdr, priority); + msg_set_redundant_link(hdr, node_up); + msg_set_seq_gap(hdr, 0); - if (msg_typ == STATE_MSG) { - u16 next_sent = l_ptr->snd_nxt; + /* Compatibility: created msg must not be in sequence with pkt flow */ + msg_set_seqno(hdr, snd_nxt + U16_MAX / 2); - if (!tipc_link_is_up(l_ptr)) + if (mtyp == STATE_MSG) { + if (!tipc_link_is_up(l)) return; - msg_set_next_sent(msg, next_sent); - if (!skb_queue_empty(&l_ptr->deferdq)) { - last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq)); - gap = mod(last_rcv - l_ptr->rcv_nxt); + msg_set_next_sent(hdr, snd_nxt); + + /* Override rcvgap if there are packets in deferred queue */ + if (!skb_queue_empty(&l->deferdq)) + rcvgap = buf_seqno(skb_peek(&l->deferdq)) - rcv_nxt; + if (rcvgap) { + msg_set_seq_gap(hdr, rcvgap); + l->stats.sent_nacks++; } - msg_set_seq_gap(msg, gap); - if (gap) - l_ptr->stats.sent_nacks++; - msg_set_link_tolerance(msg, tolerance); - msg_set_linkprio(msg, priority); - msg_set_max_pkt(msg, l_ptr->mtu); - msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); - msg_set_probe(msg, probe_msg != 0); - if (probe_msg) - l_ptr->stats.sent_probes++; - l_ptr->stats.sent_states++; - } else { /* RESET_MSG or ACTIVATE_MSG */ - msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1)); - msg_set_seq_gap(msg, 0); - msg_set_next_sent(msg, 1); - msg_set_probe(msg, 0); - msg_set_link_tolerance(msg, l_ptr->tolerance); - msg_set_linkprio(msg, l_ptr->priority); - msg_set_max_pkt(msg, l_ptr->advertised_mtu); + msg_set_ack(hdr, rcv_last); + msg_set_probe(hdr, probe); + if (probe) + l->stats.sent_probes++; + l->stats.sent_states++; + } else { + /* RESET_MSG or ACTIVATE_MSG */ + msg_set_max_pkt(hdr, l->advertised_mtu); + msg_set_ack(hdr, l->failover_checkpt - 1); + msg_set_next_sent(hdr, 1); } - - r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr)); - msg_set_redundant_link(msg, r_flag); - msg_set_linkprio(msg, l_ptr->priority); - msg_set_size(msg, msg_size); - - msg_set_seqno(msg, mod(l_ptr->snd_nxt + (0xffff / 2))); - - buf = tipc_buf_acquire(msg_size); - if (!buf) + skb = tipc_buf_acquire(msg_size(hdr)); + if (!skb) return; - - skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); - buf->priority = TC_PRIO_CONTROL; - tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf, - &l_ptr->media_addr); - l_ptr->rcv_unacked = 0; - kfree_skb(buf); + skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); + skb->priority = TC_PRIO_CONTROL; + __skb_queue_head(xmitq, skb); } -/* - * Receive protocol message : - * Note that network plane id propagates through the network, and may - * change at any time. The node with lowest address rules - */ -static void tipc_link_proto_rcv(struct tipc_link *l_ptr, - struct sk_buff *buf) -{ - u32 rec_gap = 0; - u32 msg_tol; - struct tipc_msg *msg = buf_msg(buf); - - if (l_ptr->flags & LINK_FAILINGOVER) - goto exit; - - if (l_ptr->net_plane != msg_net_plane(msg)) - if (link_own_addr(l_ptr) > msg_prevnode(msg)) - l_ptr->net_plane = msg_net_plane(msg); - - switch (msg_type(msg)) { - - case RESET_MSG: - if (!link_working_unknown(l_ptr) && - (l_ptr->peer_session != INVALID_SESSION)) { - if (less_eq(msg_session(msg), l_ptr->peer_session)) - break; /* duplicate or old reset: ignore */ - } - - if (!msg_redundant_link(msg) && (link_working_working(l_ptr) || - link_working_unknown(l_ptr))) { - /* - * peer has lost contact -- don't allow peer's links - * to reactivate before we recognize loss & clean up - */ - l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN; - } - - link_state_event(l_ptr, RESET_MSG); - - /* fall thru' */ - case ACTIVATE_MSG: - /* Update link settings according other endpoint's values */ - strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg)); - - msg_tol = msg_link_tolerance(msg); - if (msg_tol > l_ptr->tolerance) - link_set_supervision_props(l_ptr, msg_tol); - - if (msg_linkprio(msg) > l_ptr->priority) - l_ptr->priority = msg_linkprio(msg); - - if (l_ptr->mtu > msg_max_pkt(msg)) - l_ptr->mtu = msg_max_pkt(msg); - - /* Synchronize broadcast link info, if not done previously */ - if (!tipc_node_is_up(l_ptr->owner)) { - l_ptr->owner->bclink.last_sent = - l_ptr->owner->bclink.last_in = - msg_last_bcast(msg); - l_ptr->owner->bclink.oos_state = 0; - } - - l_ptr->peer_session = msg_session(msg); - l_ptr->peer_bearer_id = msg_bearer_id(msg); - - if (msg_type(msg) == ACTIVATE_MSG) - link_state_event(l_ptr, ACTIVATE_MSG); - break; - case STATE_MSG: - - msg_tol = msg_link_tolerance(msg); - if (msg_tol) - link_set_supervision_props(l_ptr, msg_tol); - - if (msg_linkprio(msg) && - (msg_linkprio(msg) != l_ptr->priority)) { - pr_debug("%s<%s>, priority change %u->%u\n", - link_rst_msg, l_ptr->name, - l_ptr->priority, msg_linkprio(msg)); - l_ptr->priority = msg_linkprio(msg); - tipc_link_reset(l_ptr); /* Enforce change to take effect */ - break; - } - - /* Record reception; force mismatch at next timeout: */ - l_ptr->silent_intv_cnt = 0; - - link_state_event(l_ptr, TRAFFIC_MSG_EVT); - l_ptr->stats.recv_states++; - if (link_reset_unknown(l_ptr)) - break; - - if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg))) - rec_gap = mod(msg_next_sent(msg) - l_ptr->rcv_nxt); - - if (msg_probe(msg)) - l_ptr->stats.recv_probes++; - - /* Protocol message before retransmits, reduce loss risk */ - if (l_ptr->owner->bclink.recv_permitted) - tipc_bclink_update_link_state(l_ptr->owner, - msg_last_bcast(msg)); - - if (rec_gap || (msg_probe(msg))) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, - rec_gap, 0, 0); - } - if (msg_seq_gap(msg)) { - l_ptr->stats.recv_nacks++; - tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq), - msg_seq_gap(msg)); - } - break; - } -exit: - kfree_skb(buf); -} - - /* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to * a different bearer. Owner node is locked. */ @@ -1496,7 +1365,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, struct sk_buff *skb; u32 length = msg_size(msg); - tunnel = l_ptr->owner->active_links[selector & 1]; + tunnel = node_active_link(l_ptr->owner, selector & 1); if (!tipc_link_is_up(tunnel)) { pr_warn("%stunnel link no longer available\n", link_co_err); return; @@ -1522,7 +1391,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, void tipc_link_failover_send_queue(struct tipc_link *l_ptr) { int msgcount; - struct tipc_link *tunnel = l_ptr->owner->active_links[0]; + struct tipc_link *tunnel = node_active_link(l_ptr->owner, 0); struct tipc_msg tunnel_hdr; struct sk_buff *skb; int split_bundles; @@ -1556,8 +1425,8 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) return; } - split_bundles = (l_ptr->owner->active_links[0] != - l_ptr->owner->active_links[1]); + split_bundles = (node_active_link(l_ptr->owner, 0) != + node_active_link(l_ptr->owner, 0)); skb_queue_walk(&l_ptr->transmq, skb) { struct tipc_msg *msg = buf_msg(skb); @@ -1660,7 +1529,7 @@ static bool tipc_link_failover_rcv(struct tipc_link *link, if (bearer_id == link->bearer_id) goto exit; - pl = link->owner->links[bearer_id]; + pl = link->owner->links[bearer_id].link; if (pl && tipc_link_is_up(pl)) tipc_link_reset(pl); @@ -1691,22 +1560,100 @@ static bool tipc_link_failover_rcv(struct tipc_link *link, } exit: if (!link->failover_pkts && pl) - pl->flags &= ~LINK_FAILINGOVER; + pl->exec_mode = TIPC_LINK_OPEN; kfree_skb(*skb); *skb = iskb; return *skb; } -static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) +/* tipc_link_proto_rcv(): receive link level protocol message : + * Note that network plane id propagates through the network, and may + * change at any time. The node with lowest numerical id determines + * network plane + */ +static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; + struct tipc_msg *hdr = buf_msg(skb); + u16 rcvgap = 0; + u16 nacked_gap = msg_seq_gap(hdr); + u16 peers_snd_nxt = msg_next_sent(hdr); + u16 peers_tol = msg_link_tolerance(hdr); + u16 peers_prio = msg_linkprio(hdr); + char *if_name; + int rc = 0; - if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL)) - return; + if (l->exec_mode == TIPC_LINK_BLOCKED) + goto exit; + + if (link_own_addr(l) > msg_prevnode(hdr)) + l->net_plane = msg_net_plane(hdr); + + switch (msg_type(hdr)) { + case RESET_MSG: + + /* Ignore duplicate RESET with old session number */ + if ((less_eq(msg_session(hdr), l->peer_session)) && + (l->peer_session != WILDCARD_SESSION)) + break; + /* fall thru' */ + case ACTIVATE_MSG: + + /* Complete own link name with peer's interface name */ + if_name = strrchr(l->name, ':') + 1; + if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) + break; + if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) + break; + strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); + + /* Update own tolerance if peer indicates a non-zero value */ + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + l->tolerance = peers_tol; - l_ptr->tolerance = tol; - l_ptr->keepalive_intv = msecs_to_jiffies(intv); - l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->keepalive_intv)); + /* Update own priority if peer's priority is higher */ + if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) + l->priority = peers_prio; + + l->peer_session = msg_session(hdr); + l->peer_bearer_id = msg_bearer_id(hdr); + rc = tipc_link_fsm_evt(l, msg_type(hdr), xmitq); + if (l->mtu > msg_max_pkt(hdr)) + l->mtu = msg_max_pkt(hdr); + break; + case STATE_MSG: + /* Update own tolerance if peer indicates a non-zero value */ + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + l->tolerance = peers_tol; + + l->silent_intv_cnt = 0; + l->stats.recv_states++; + if (msg_probe(hdr)) + l->stats.recv_probes++; + rc = tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); + if (!tipc_link_is_up(l)) + break; + + /* Has peer sent packets we haven't received yet ? */ + if (more(peers_snd_nxt, l->rcv_nxt)) + rcvgap = peers_snd_nxt - l->rcv_nxt; + if (rcvgap || (msg_probe(hdr))) + tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, + 0, 0, xmitq); + tipc_link_release_pkts(l, msg_ack(hdr)); + + /* If NACK, retransmit will now start at right position */ + if (nacked_gap) { + rc |= tipc_link_retransm(l, nacked_gap, xmitq); + l->stats.recv_nacks++; + } + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } +exit: + kfree_skb(skb); + return rc; } void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) @@ -1743,7 +1690,7 @@ static struct tipc_node *tipc_link_find_owner(struct net *net, list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { tipc_node_lock(n_ptr); for (i = 0; i < MAX_BEARERS; i++) { - l_ptr = n_ptr->links[i]; + l_ptr = n_ptr->links[i].link; if (l_ptr && !strcmp(l_ptr->name, link_name)) { *bearer_id = i; found_node = n_ptr; @@ -1770,27 +1717,28 @@ static void link_reset_statistics(struct tipc_link *l_ptr) l_ptr->stats.recv_info = l_ptr->rcv_nxt; } -static void link_print(struct tipc_link *l_ptr, const char *str) +static void link_print(struct tipc_link *l, const char *str) { - struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id); - struct tipc_bearer *b_ptr; - - rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]); - if (b_ptr) - pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name); - rcu_read_unlock(); - - if (link_working_unknown(l_ptr)) - pr_cont(":WU\n"); - else if (link_reset_reset(l_ptr)) - pr_cont(":RR\n"); - else if (link_reset_unknown(l_ptr)) - pr_cont(":RU\n"); - else if (link_working_working(l_ptr)) - pr_cont(":WW\n"); + struct sk_buff *hskb = skb_peek(&l->transmq); + u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt; + u16 tail = l->snd_nxt - 1; + + pr_info("%s Link <%s>:", str, l->name); + + if (link_probing(l)) + pr_cont(":P\n"); + else if (link_establishing(l)) + pr_cont(":E\n"); + else if (link_resetting(l)) + pr_cont(":R\n"); + else if (link_working(l)) + pr_cont(":W\n"); else pr_cont("\n"); + + pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n", + skb_queue_len(&l->transmq), head, tail, + skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt); } /* Parse and validate nested (link) properties valid for media, bearer and link @@ -1865,7 +1813,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; @@ -1885,7 +1833,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - link_set_supervision_props(link, tol); + link->tolerance = tol; tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0); } if (props[TIPC_NLA_PROP_PRIO]) { @@ -2055,10 +2003,11 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; - if (!node->links[i]) + if (!node->links[i].link) continue; - err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI); + err = __tipc_nl_add_link(net, msg, + node->links[i].link, NLM_F_MULTI); if (err) return err; } @@ -2172,7 +2121,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) return -EINVAL; tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { tipc_node_unlock(node); nlmsg_free(msg.skb); @@ -2227,7 +2176,7 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { tipc_node_unlock(node); return -EINVAL; diff --git a/net/tipc/link.h b/net/tipc/link.h index ae0a0ea..37cfd7d 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -49,19 +49,21 @@ */ #define INVALID_LINK_SEQ 0x10000 -/* Link working states + +/* Link endpoint receive states */ -#define WORKING_WORKING 560810u -#define WORKING_UNKNOWN 560811u -#define RESET_UNKNOWN 560812u -#define RESET_RESET 560813u +enum { + TIPC_LINK_OPEN, + TIPC_LINK_BLOCKED, + TIPC_LINK_TUNNEL +}; -/* Link endpoint execution states +/* Events returned from link at packet reception or at timeout */ -#define LINK_STARTED 0x0001 -#define LINK_STOPPED 0x0002 -#define LINK_SYNCHING 0x0004 -#define LINK_FAILINGOVER 0x0008 +enum { + TIPC_LINK_UP_EVT = 1, + TIPC_LINK_DOWN_EVT = (1 << 1) +}; /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) @@ -106,7 +108,6 @@ struct tipc_stats { * @timer: link timer * @owner: pointer to peer node * @refcnt: reference counter for permanent references (owner node & timer) - * @flags: execution state flags for link endpoint instance * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link @@ -119,6 +120,7 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') + * @exec_mode: transmit/receive mode for link endpoint instance * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset @@ -144,12 +146,9 @@ struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; struct tipc_media_addr media_addr; - struct timer_list timer; struct tipc_node *owner; - struct kref ref; /* Management and link supervision data */ - unsigned int flags; u32 peer_session; u32 peer_bearer_id; u32 bearer_id; @@ -165,6 +164,7 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; + u8 exec_mode; u16 synch_point; /* Failover */ @@ -192,8 +192,8 @@ struct tipc_link { u16 rcv_nxt; u32 rcv_unacked; struct sk_buff_head deferdq; - struct sk_buff_head inputq; - struct sk_buff_head namedq; + struct sk_buff_head *inputq; + struct sk_buff_head *namedq; /* Congestion handling */ struct sk_buff_head wakeupq; @@ -207,9 +207,11 @@ struct tipc_link { struct tipc_port; -struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, - struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr); +struct tipc_link *tipc_link_create(struct tipc_node *n, + struct tipc_bearer *b, + const struct tipc_media_addr *maddr, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq); void tipc_link_delete(struct tipc_link *link); void tipc_link_delete_list(struct net *net, unsigned int bearer_id); void tipc_link_failover_send_queue(struct tipc_link *l_ptr); @@ -221,12 +223,11 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_purge_backlog(struct tipc_link *l); void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); -int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, - u32 selector); -int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, - u32 selector); +void tipc_link_activate(struct tipc_link *link); int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); +int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, + struct sk_buff_head *xmitq); void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, u32 gap, u32 tolerance, u32 priority); void tipc_link_push_packets(struct tipc_link *l_ptr); @@ -243,33 +244,12 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); void link_prepare_wakeup(struct tipc_link *l); - +int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); +int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); static inline u32 link_own_addr(struct tipc_link *l) { return msg_prevnode(l->pmsg); } -/* - * Link status checking routines - */ -static inline int link_working_working(struct tipc_link *l_ptr) -{ - return l_ptr->state == WORKING_WORKING; -} - -static inline int link_working_unknown(struct tipc_link *l_ptr) -{ - return l_ptr->state == WORKING_UNKNOWN; -} - -static inline int link_reset_unknown(struct tipc_link *l_ptr) -{ - return l_ptr->state == RESET_UNKNOWN; -} - -static inline int link_reset_reset(struct tipc_link *l_ptr) -{ - return l_ptr->state == RESET_RESET; -} - #endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 19c45fb..2f1563b 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -38,6 +38,7 @@ #define _TIPC_MSG_H #include <linux/tipc.h> +#include "core.h" /* * Constants and routines used to read and write TIPC payload message headers @@ -658,12 +659,12 @@ static inline void msg_set_link_selector(struct tipc_msg *m, u32 n) /* * Word 5 */ -static inline u32 msg_session(struct tipc_msg *m) +static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } -static inline void msg_set_session(struct tipc_msg *m, u32 n) +static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } @@ -766,6 +767,22 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline bool msg_is_traffic(struct tipc_msg *m) +{ + if (likely(msg_user(m) != LINK_PROTOCOL)) + return true; + if ((msg_type(m) == RESET_MSG) || (msg_type(m) == ACTIVATE_MSG)) + return false; + return true; +} + +static inline bool msg_peer_is_up(struct tipc_msg *m) +{ + if (likely(msg_is_traffic(m))) + return false; + return msg_redundant_link(m); +} + struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, @@ -879,4 +896,36 @@ static inline bool tipc_skb_queue_tail(struct sk_buff_head *list, return rv; } +/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number + * @list: list to be appended to + * @skb: buffer to add + * Returns true if queue should treated further, otherwise false + */ +static inline bool __tipc_skb_queue_sorted(struct sk_buff_head *list, + struct sk_buff *skb) +{ + struct sk_buff *_skb, *tmp; + struct tipc_msg *hdr = buf_msg(skb); + u16 seqno = msg_seqno(hdr); + + if (skb_queue_empty(list) || (msg_user(hdr) == LINK_PROTOCOL)) { + __skb_queue_head(list, skb); + return true; + } + if (likely(less(seqno, buf_seqno(skb_peek(list))))) { + __skb_queue_head(list, skb); + return true; + } + if (!more(seqno, buf_seqno(skb_peek_tail(list)))) { + skb_queue_walk_safe(list, _skb, tmp) { + if (likely(less(seqno, buf_seqno(_skb)))) { + __skb_queue_before(list, _skb, skb); + return true; + } + } + } + __skb_queue_tail(list, skb); + return false; +} + #endif diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 41e7b7e..e6018b7 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -96,13 +96,13 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb) dnode = node->addr; if (in_own_node(net, dnode)) continue; - if (!tipc_node_active_links(node)) + if (!tipc_node_is_up(node)) continue; oskb = pskb_copy(skb, GFP_ATOMIC); if (!oskb) break; msg_set_destnode(buf_msg(oskb), dnode); - tipc_link_xmit_skb(net, oskb, dnode, dnode); + tipc_node_xmit_skb(net, oskb, dnode, dnode); } rcu_read_unlock(); @@ -223,7 +223,7 @@ void tipc_named_node_up(struct net *net, u32 dnode) &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]); rcu_read_unlock(); - tipc_link_xmit(net, &head, dnode, dnode); + tipc_node_xmit(net, &head, dnode, dnode); } static void tipc_publ_subscribe(struct net *net, struct publication *publ, diff --git a/net/tipc/node.c b/net/tipc/node.c index 0b1d61a..e92f84a 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -40,10 +40,13 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "discover.h" static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); static void tipc_node_delete(struct tipc_node *node); +static void tipc_node_timeout(unsigned long data); +static void tipc_node_fsm_evt(struct tipc_node *n, int evt); struct tipc_sock_conn { u32 port; @@ -132,6 +135,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->publ_list); INIT_LIST_HEAD(&n_ptr->conn_sks); + skb_queue_head_init(&n_ptr->bclink.namedq); __skb_queue_head_init(&n_ptr->bclink.deferdq); hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { @@ -139,14 +143,32 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) break; } list_add_tail_rcu(&n_ptr->list, &temp_node->list); - n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; + n_ptr->state = SELF_DOWN_PEER_LEAVING; n_ptr->signature = INVALID_NODE_SIG; + n_ptr->active_links[0] = INVALID_BEARER_ID; + n_ptr->active_links[1] = INVALID_BEARER_ID; tipc_node_get(n_ptr); + setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr); + n_ptr->keepalive_intv = U32_MAX; exit: spin_unlock_bh(&tn->node_list_lock); return n_ptr; } +static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) +{ + unsigned long tol = l->tolerance; + unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; + unsigned long keepalive_intv = msecs_to_jiffies(intv); + + /* Link with lowest tolerance determines timer interval */ + if (keepalive_intv < n->keepalive_intv) + n->keepalive_intv = keepalive_intv; + + /* Ensure link's abort limit corresponds to current interval */ + l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv); +} + static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); @@ -160,8 +182,11 @@ void tipc_node_stop(struct net *net) struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) + list_for_each_entry_safe(node, t_node, &tn->node_list, list) { + if (del_timer(&node->timer)) + tipc_node_put(node); tipc_node_put(node); + } spin_unlock_bh(&tn->node_list_lock); } @@ -219,131 +244,170 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) tipc_node_put(node); } +/* tipc_node_timeout - handle expiration of node timer + */ +static void tipc_node_timeout(unsigned long data) +{ + struct tipc_node *n = (struct tipc_node *)data; + struct sk_buff_head xmitq; + struct tipc_link *l; + struct tipc_media_addr *maddr; + int bearer_id; + int rc = 0; + + __skb_queue_head_init(&xmitq); + + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + tipc_node_lock(n); + l = n->links[bearer_id].link; + if (l) { + /* Link tolerance may change asynchronously: */ + tipc_node_calculate_timer(n, l); + rc = tipc_link_timeout(l, &xmitq); + if (rc & TIPC_LINK_DOWN_EVT) + tipc_link_reset(l); + } + tipc_node_unlock(n); + maddr = &n->links[bearer_id].maddr; + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + } + if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + tipc_node_get(n); + tipc_node_put(n); +} + /** * tipc_node_link_up - handle addition of link * * Link becomes active (alone or shared) or standby, depending on its priority. */ -void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +void tipc_node_link_up(struct tipc_node *n, int bearer_id) { - struct tipc_link **active = &n_ptr->active_links[0]; + int *slot0 = &n->active_links[0]; + int *slot1 = &n->active_links[1]; + struct tipc_link_entry *links = n->links; + struct tipc_link *l = n->links[bearer_id].link; - n_ptr->working_links++; - n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP; - n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id; + /* Leave room for tunnel header when returning 'mtu' to users: */ + links[bearer_id].mtu = l->mtu - INT_H_SIZE; + + n->working_links++; + n->action_flags |= TIPC_NOTIFY_LINK_UP; + n->link_id = l->peer_bearer_id << 16 | l->bearer_id; pr_debug("Established link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); + l->name, l->net_plane); - if (!active[0]) { - active[0] = active[1] = l_ptr; - node_established_contact(n_ptr); - goto exit; + /* No active links ? => take both active slots */ + if (*slot0 < 0) { + *slot0 = bearer_id; + *slot1 = bearer_id; + node_established_contact(n); + return; } - if (l_ptr->priority < active[0]->priority) { - pr_debug("New link <%s> becomes standby\n", l_ptr->name); - goto exit; + + /* Lower prio than current active ? => no slot */ + if (l->priority < links[*slot0].link->priority) { + pr_debug("New link <%s> becomes standby\n", l->name); + return; } - tipc_link_dup_queue_xmit(active[0], l_ptr); - if (l_ptr->priority == active[0]->priority) { - active[0] = l_ptr; - goto exit; + tipc_link_dup_queue_xmit(links[*slot0].link, l); + + /* Same prio as current active ? => take one slot */ + if (l->priority == links[*slot0].link->priority) { + *slot0 = bearer_id; + return; } - pr_debug("Old link <%s> becomes standby\n", active[0]->name); - if (active[1] != active[0]) - pr_debug("Old link <%s> becomes standby\n", active[1]->name); - active[0] = active[1] = l_ptr; -exit: - /* Leave room for changeover header when returning 'mtu' to users: */ - n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; + + /* Higher prio than current active => take both active slots */ + pr_debug("Old link <%s> now standby\n", links[*slot0].link->name); + *slot0 = bearer_id; + *slot1 = bearer_id; } /** - * node_select_active_links - select active link + * tipc_node_link_down - handle loss of link */ -static void node_select_active_links(struct tipc_node *n_ptr) +void tipc_node_link_down(struct tipc_node *n, int bearer_id) { - struct tipc_link **active = &n_ptr->active_links[0]; - u32 i; - u32 highest_prio = 0; + int *slot0 = &n->active_links[0]; + int *slot1 = &n->active_links[1]; + int i, highest = 0; + struct tipc_link *l, *_l; - active[0] = active[1] = NULL; + l = n->links[bearer_id].link; + n->working_links--; + n->action_flags |= TIPC_NOTIFY_LINK_DOWN; + n->link_id = l->peer_bearer_id << 16 | l->bearer_id; - for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i]; + pr_debug("Lost link <%s> on network plane %c\n", + l->name, l->net_plane); - if (!l_ptr || !tipc_link_is_up(l_ptr) || - (l_ptr->priority < highest_prio)) + /* Select new active link if any available */ + *slot0 = INVALID_BEARER_ID; + *slot1 = INVALID_BEARER_ID; + for (i = 0; i < MAX_BEARERS; i++) { + _l = n->links[i].link; + if (!_l || !tipc_link_is_up(_l)) + continue; + if (_l->priority < highest) + continue; + if (_l->priority > highest) { + highest = _l->priority; + *slot0 = i; + *slot1 = i; continue; - - if (l_ptr->priority > highest_prio) { - highest_prio = l_ptr->priority; - active[0] = active[1] = l_ptr; - } else { - active[1] = l_ptr; } + *slot1 = i; } + if (tipc_node_is_up(n)) + tipc_link_failover_send_queue(l); + else + node_lost_contact(n); } -/** - * tipc_node_link_down - handle loss of link - */ -void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +bool tipc_node_is_up(struct tipc_node *n) { - struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); - struct tipc_link **active; - - n_ptr->working_links--; - n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN; - n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id; - - if (!tipc_link_is_active(l_ptr)) { - pr_debug("Lost standby link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); - return; - } - pr_debug("Lost link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); - - active = &n_ptr->active_links[0]; - if (active[0] == l_ptr) - active[0] = active[1]; - if (active[1] == l_ptr) - active[1] = active[0]; - if (active[0] == l_ptr) - node_select_active_links(n_ptr); - if (tipc_node_is_up(n_ptr)) - tipc_link_failover_send_queue(l_ptr); - else - node_lost_contact(n_ptr); - - /* Leave room for changeover header when returning 'mtu' to users: */ - if (active[0]) { - n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; - return; - } - /* Loopback link went down? No fragmentation needed from now on. */ - if (n_ptr->addr == tn->own_addr) { - n_ptr->act_mtus[0] = MAX_MSG_SIZE; - n_ptr->act_mtus[1] = MAX_MSG_SIZE; - } + return n->active_links[0] != INVALID_BEARER_ID; } -int tipc_node_active_links(struct tipc_node *n_ptr) +void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *b, + bool *link_up, bool *addr_match, + struct tipc_media_addr *maddr) { - return n_ptr->active_links[0] != NULL; + struct tipc_link *l = n->links[b->identity].link; + struct tipc_media_addr *curr = &n->links[b->identity].maddr; + + *link_up = l && tipc_link_is_up(l); + *addr_match = l && !memcmp(curr, maddr, sizeof(*maddr)); } -int tipc_node_is_up(struct tipc_node *n_ptr) +bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, + struct tipc_media_addr *maddr) { - return tipc_node_active_links(n_ptr); + struct tipc_link *l = n->links[b->identity].link; + struct tipc_media_addr *curr = &n->links[b->identity].maddr; + struct sk_buff_head *inputq = &n->links[b->identity].inputq; + + if (!l) { + l = tipc_link_create(n, b, maddr, inputq, &n->bclink.namedq); + if (!l) + return false; + tipc_node_calculate_timer(n, l); + if (n->link_cnt == 1) { + if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + tipc_node_get(n); + } + } + memcpy(&l->media_addr, maddr, sizeof(*maddr)); + memcpy(curr, maddr, sizeof(*maddr)); + tipc_link_reset(l); + return true; } void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { - n_ptr->links[l_ptr->bearer_id] = l_ptr; + n_ptr->links[l_ptr->bearer_id].link = l_ptr; n_ptr->link_cnt++; } @@ -352,15 +416,151 @@ void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) int i; for (i = 0; i < MAX_BEARERS; i++) { - if (l_ptr != n_ptr->links[i]) + if (l_ptr != n_ptr->links[i].link) continue; - n_ptr->links[i] = NULL; + n_ptr->links[i].link = NULL; n_ptr->link_cnt--; } } +/* tipc_node_fsm_evt - node finite state machine + * Determines when contact is allowed with peer node + */ +static void tipc_node_fsm_evt(struct tipc_node *n, int evt) +{ + int state = n->state; + + switch (state) { + case SELF_DOWN_PEER_DOWN: + switch (evt) { + case SELF_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_COMING; + break; + case PEER_ESTABL_CONTACT_EVT: + state = SELF_COMING_PEER_UP; + break; + case SELF_LOST_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_UP_PEER_UP: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_DOWN_PEER_LEAVING: + switch (evt) { + case PEER_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case SELF_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_UP_PEER_COMING: + switch (evt) { + case PEER_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_UP; + break; + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_COMING_PEER_UP: + switch (evt) { + case SELF_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_UP; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case SELF_LOST_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_LEAVING_PEER_DOWN: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + default: + pr_err("Unknown node fsm state %x\n", state); + break; + } + + n->state = state; +} + +bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_link *l, + struct tipc_msg *hdr) +{ + int state = n->state; + + if (likely(state == SELF_UP_PEER_UP)) + return true; + + if (state == SELF_DOWN_PEER_DOWN) + return true; + + if (state == SELF_UP_PEER_COMING) { + /* If not traffic msg, peer may still be ESTABLISHING */ + if (tipc_link_is_up(l) && msg_is_traffic(hdr)) + tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); + return true; + } + + if (state == SELF_COMING_PEER_UP) + return true; + + if (state == SELF_LEAVING_PEER_DOWN) + return false; + + if (state == SELF_DOWN_PEER_LEAVING) { + if (msg_peer_is_up(hdr)) + return false; + tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + return true; + } + return false; +} + static void node_established_contact(struct tipc_node *n_ptr) { + tipc_node_fsm_evt(n_ptr, SELF_ESTABL_CONTACT_EVT); n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP; n_ptr->bclink.oos_state = 0; n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net); @@ -396,21 +596,18 @@ static void node_lost_contact(struct tipc_node *n_ptr) /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i]; + struct tipc_link *l_ptr = n_ptr->links[i].link; if (!l_ptr) continue; - l_ptr->flags &= ~LINK_FAILINGOVER; + l_ptr->exec_mode = TIPC_LINK_OPEN; l_ptr->failover_checkpt = 0; l_ptr->failover_pkts = 0; kfree_skb(l_ptr->failover_skb); l_ptr->failover_skb = NULL; tipc_link_reset_fragments(l_ptr); } - - n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN; - /* Prevent re-contact with node until cleanup is done */ - n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN; + tipc_node_fsm_evt(n_ptr, SELF_LOST_CONTACT_EVT); /* Notify publications from this node */ n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN; @@ -453,7 +650,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, goto exit; tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (link) { strncpy(linkname, link->name, len); err = 0; @@ -559,6 +756,160 @@ msg_full: return -EMSGSIZE; } +static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel, + int *bearer_id, + struct tipc_media_addr **maddr) +{ + int id = n->active_links[sel & 1]; + + if (unlikely(id < 0)) + return NULL; + + *bearer_id = id; + *maddr = &n->links[id].maddr; + return n->links[id].link; +} + +/** + * tipc_node_xmit() is the general link level function for message sending + * @net: the applicable net namespace + * @list: chain of buffers containing message + * @dnode: address of destination node + * @selector: a number used for deterministic link selection + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + */ +int tipc_node_xmit(struct net *net, struct sk_buff_head *list, + u32 dnode, int selector) +{ + struct tipc_link *l = NULL; + struct tipc_node *n; + struct sk_buff_head xmitq; + struct tipc_media_addr *maddr; + int bearer_id; + int rc = -EHOSTUNREACH; + + __skb_queue_head_init(&xmitq); + n = tipc_node_find(net, dnode); + if (likely(n)) { + tipc_node_lock(n); + l = tipc_node_select_link(n, selector, &bearer_id, &maddr); + if (likely(l)) + rc = tipc_link_xmit(l, list, &xmitq); + if (unlikely(rc == -ENOBUFS)) + tipc_link_reset(l); + tipc_node_unlock(n); + tipc_node_put(n); + } + if (likely(!rc)) { + tipc_bearer_xmit(net, bearer_id, &xmitq, maddr); + return 0; + } + if (likely(in_own_node(net, dnode))) { + tipc_sk_rcv(net, list); + return 0; + } + return rc; +} + +/* tipc_node_xmit_skb(): send single buffer to destination + * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE + * messages, which will not be rejected + * The only exception is datagram messages rerouted after secondary + * lookup, which are rare and safe to dispose of anyway. + * TODO: Return real return value, and let callers use + * tipc_wait_for_sendpkt() where applicable + */ +int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, + u32 selector) +{ + struct sk_buff_head head; + int rc; + + skb_queue_head_init(&head); + __skb_queue_tail(&head, skb); + rc = tipc_node_xmit(net, &head, dnode, selector); + if (rc == -ELINKCONG) + kfree_skb(skb); + return 0; +} + +/** + * tipc_rcv - process TIPC packets/messages arriving from off-node + * @net: the applicable net namespace + * @skb: TIPC packet + * @bearer: pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive. + */ +void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) +{ + struct sk_buff_head xmitq; + struct tipc_node *n; + struct tipc_link *l; + struct tipc_msg *hdr; + struct tipc_media_addr *maddr; + int bearer_id = b->identity; + int rc = 0; + + __skb_queue_head_init(&xmitq); + + /* Ensure message is well-formed */ + if (unlikely(!tipc_msg_validate(skb))) + goto discard; + + /* Handle arrival of a non-unicast link packet */ + hdr = buf_msg(skb); + if (unlikely(msg_non_seq(hdr))) { + if (msg_user(hdr) == LINK_CONFIG) + tipc_disc_rcv(net, skb, b); + else + tipc_bclink_rcv(net, skb); + return; + } + + /* Locate neighboring node that sent packet */ + n = tipc_node_find(net, msg_prevnode(hdr)); + if (unlikely(!n)) + goto discard; + tipc_node_lock(n); + + /* Locate link endpoint that should handle packet */ + l = n->links[bearer_id].link; + if (unlikely(!l)) + goto unlock; + + /* Is reception of this packet permitted at the moment ? */ + if (unlikely(n->state != SELF_UP_PEER_UP)) + if (!tipc_node_filter_skb(n, l, hdr)) + goto unlock; + + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) + tipc_bclink_sync_state(n, hdr); + + /* Release acked broadcast messages */ + if (unlikely(n->bclink.acked != msg_bcast_ack(hdr))) + tipc_bclink_acknowledge(n, msg_bcast_ack(hdr)); + + /* Check protocol and update link state */ + rc = tipc_link_rcv(l, skb, &xmitq); + + if (unlikely(rc & TIPC_LINK_UP_EVT)) + tipc_link_activate(l); + if (unlikely(rc & TIPC_LINK_DOWN_EVT)) + tipc_link_reset(l); + skb = NULL; +unlock: + tipc_node_unlock(n); + tipc_sk_rcv(net, &n->links[bearer_id].inputq); + maddr = &n->links[bearer_id].maddr; + tipc_bearer_xmit(net, bearer_id, &xmitq, maddr); + tipc_node_put(n); +discard: + kfree_skb(skb); +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/net/tipc/node.h b/net/tipc/node.h index 5a834cf..5e70168 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,6 +45,26 @@ /* Out-of-range value for node signature */ #define INVALID_NODE_SIG 0x10000 +#define INVALID_BEARER_ID -1 + +/* Node FSM states and events: + */ +enum { + SELF_DOWN_PEER_DOWN = 0xdd, + SELF_UP_PEER_UP = 0xaa, + SELF_DOWN_PEER_LEAVING = 0xd1, + SELF_UP_PEER_COMING = 0xac, + SELF_COMING_PEER_UP = 0xca, + SELF_LEAVING_PEER_DOWN = 0x1d, +}; + +enum { + SELF_ESTABL_CONTACT_EVT = 0xec, + SELF_LOST_CONTACT_EVT = 0x1c, + PEER_ESTABL_CONTACT_EVT = 0xfec, + PEER_LOST_CONTACT_EVT = 0xf1c +}; + /* Flags used to take different actions according to flag type * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down @@ -54,8 +74,6 @@ */ enum { TIPC_MSG_EVT = 1, - TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1), - TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_WAKEUP_BCAST_USERS = (1 << 5), @@ -85,10 +103,17 @@ struct tipc_node_bclink { u32 deferred_size; struct sk_buff_head deferdq; struct sk_buff *reasm_buf; - int inputq_map; + struct sk_buff_head namedq; bool recv_permitted; }; +struct tipc_link_entry { + struct tipc_link *link; + u32 mtu; + struct sk_buff_head inputq; + struct tipc_media_addr maddr; +}; + /** * struct tipc_node - TIPC node structure * @addr: network address of node @@ -98,9 +123,8 @@ struct tipc_node_bclink { * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages - * @curr_link: the link holding the node lock, if any - * @active_links: pointers to active links to node - * @links: pointers to all links to node + * @active_links: bearer ids of active links, used as index into links[] array + * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @bclink: broadcast-related info * @list: links to adjacent nodes in sorted list of cluster's nodes @@ -120,12 +144,12 @@ struct tipc_node { struct hlist_node hash; struct sk_buff_head *inputq; struct sk_buff_head *namedq; - struct tipc_link *active_links[2]; - u32 act_mtus[2]; - struct tipc_link *links[MAX_BEARERS]; + int active_links[2]; + struct tipc_link_entry links[MAX_BEARERS]; int action_flags; struct tipc_node_bclink bclink; struct list_head list; + int state; int link_cnt; u16 working_links; u16 capabilities; @@ -133,6 +157,8 @@ struct tipc_node { u32 link_id; struct list_head publ_list; struct list_head conn_sks; + unsigned long keepalive_intv; + struct timer_list timer; struct rcu_head rcu; }; @@ -140,18 +166,25 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr); void tipc_node_put(struct tipc_node *node); struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); +void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *bearer, + bool *link_up, bool *addr_match, + struct tipc_media_addr *maddr); +bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *bearer, + struct tipc_media_addr *maddr); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -int tipc_node_active_links(struct tipc_node *n_ptr); -int tipc_node_is_up(struct tipc_node *n_ptr); +void tipc_node_link_down(struct tipc_node *n_ptr, int bearer_id); +void tipc_node_link_up(struct tipc_node *n_ptr, int bearer_id); +bool tipc_node_is_up(struct tipc_node *n); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); void tipc_node_unlock(struct tipc_node *node); +int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, + int selector); +int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, + u32 selector); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); - int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); static inline void tipc_node_lock(struct tipc_node *node) @@ -159,26 +192,30 @@ static inline void tipc_node_lock(struct tipc_node *node) spin_lock_bh(&node->lock); } -static inline bool tipc_node_blocked(struct tipc_node *node) +static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) { - return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN | - TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); + int bearer_id = n->active_links[sel & 1]; + + if (unlikely(bearer_id == INVALID_BEARER_ID)) + return NULL; + + return n->links[bearer_id].link; } -static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) +static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) { - struct tipc_node *node; - u32 mtu; - - node = tipc_node_find(net, addr); + struct tipc_node *n; + int bearer_id; + unsigned int mtu = MAX_MSG_SIZE; - if (likely(node)) { - mtu = node->act_mtus[selector & 1]; - tipc_node_put(node); - } else { - mtu = MAX_MSG_SIZE; - } + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return mtu; + bearer_id = n->active_links[sel & 1]; + if (likely(bearer_id != INVALID_BEARER_ID)) + mtu = n->links[bearer_id].mtu; + tipc_node_put(n); return mtu; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3a7567f..5b0b08d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -261,7 +261,7 @@ static void tsk_rej_rx_queue(struct sock *sk) while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT)) - tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, 0); } } @@ -443,7 +443,7 @@ static int tipc_release(struct socket *sock) } if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, TIPC_ERR_NO_PORT)) - tipc_link_xmit_skb(net, skb, dnode, 0); + tipc_node_xmit_skb(net, skb, dnode, 0); } } @@ -456,7 +456,7 @@ static int tipc_release(struct socket *sock) tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, TIPC_ERR_NO_PORT); if (skb) - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); tipc_node_remove_conn(net, dnode, tsk->portid); } @@ -686,21 +686,22 @@ new_mtu: do { rc = tipc_bclink_xmit(net, pktchain); - if (likely(rc >= 0)) { - rc = dsz; - break; + if (likely(!rc)) + return dsz; + + if (rc == -ELINKCONG) { + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (!rc) + continue; } + __skb_queue_purge(pktchain); if (rc == -EMSGSIZE) { msg->msg_iter = save; goto new_mtu; } - if (rc != -ELINKCONG) - break; - tipc_sk(sk)->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); - } while (!rc); + break; + } while (1); return rc; } @@ -924,24 +925,25 @@ new_mtu: do { skb = skb_peek(pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid); - if (likely(rc >= 0)) { + rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid); + if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; - rc = dsz; - break; + return dsz; } + if (rc == -ELINKCONG) { + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (!rc) + continue; + } + __skb_queue_purge(pktchain); if (rc == -EMSGSIZE) { m->msg_iter = save; goto new_mtu; } - if (rc != -ELINKCONG) - break; - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); - } while (!rc); + break; + } while (1); return rc; } @@ -1043,15 +1045,16 @@ next: return rc; do { if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_link_xmit(net, pktchain, dnode, portid); + rc = tipc_node_xmit(net, pktchain, dnode, portid); if (likely(!rc)) { tsk->sent_unacked++; sent += send; if (sent == dsz) - break; + return dsz; goto next; } if (rc == -EMSGSIZE) { + __skb_queue_purge(pktchain); tsk->max_pkt = tipc_node_get_mtu(net, dnode, portid); m->msg_iter = save; @@ -1059,13 +1062,13 @@ next: } if (rc != -ELINKCONG) break; + tsk->link_cong = 1; } rc = tipc_wait_for_sndpkt(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); } while (!rc); + __skb_queue_purge(pktchain); return sent ? sent : rc; } @@ -1221,7 +1224,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) return; msg = buf_msg(skb); msg_set_msgcnt(msg, ack); - tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg)); + tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) @@ -1700,7 +1703,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) return 0; } if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err)) - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); return 0; } @@ -1796,7 +1799,7 @@ int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err)) continue; xmit: - tipc_link_xmit_skb(net, skb, dnode, dport); + tipc_node_xmit_skb(net, skb, dnode, dport); } return err ? -EHOSTUNREACH : 0; } @@ -2089,7 +2092,7 @@ restart: } if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, TIPC_CONN_SHUTDOWN)) - tipc_link_xmit_skb(net, skb, dnode, + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); } else { dnode = tsk_peer_node(tsk); @@ -2099,7 +2102,7 @@ restart: 0, dnode, tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, TIPC_CONN_SHUTDOWN); - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); } tsk->connected = 0; sock->state = SS_DISCONNECTING; @@ -2161,7 +2164,7 @@ static void tipc_sk_timeout(unsigned long data) } bh_unlock_sock(sk); if (skb) - tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); + tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); exit: sock_put(sk); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index bd16c6c..0cebf1f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2048,7 +2048,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_delete(xp, 1, true); } else { // reset the timers here? - WARN(1, "Dont know what to do with soft policy expire\n"); + WARN(1, "Don't know what to do with soft policy expire\n"); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); |