diff options
Diffstat (limited to 'net')
169 files changed, 3480 insertions, 3298 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index d2cd9de..69929c0 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -261,7 +261,6 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; - new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT); vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ad5e2fd1..055f0e9 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -799,6 +799,7 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; + dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index ae63cf7..5f1446c 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -184,12 +184,11 @@ int vlan_proc_add_dev(struct net_device *vlandev) /* * Delete directory entry for VLAN device. */ -int vlan_proc_rem_dev(struct net_device *vlandev) +void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. 
*/ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; - return 0; } /****** Proc filesystem entry points ****************************************/ diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h index 063f60a..8838a2e 100644 --- a/net/8021q/vlanproc.h +++ b/net/8021q/vlanproc.h @@ -5,7 +5,7 @@ struct net; int vlan_proc_init(struct net *net); -int vlan_proc_rem_dev(struct net_device *vlandev); +void vlan_proc_rem_dev(struct net_device *vlandev); int vlan_proc_add_dev(struct net_device *vlandev); void vlan_proc_cleanup(struct net *net); @@ -14,7 +14,7 @@ void vlan_proc_cleanup(struct net *net); #define vlan_proc_init(net) (0) #define vlan_proc_cleanup(net) do {} while (0) #define vlan_proc_add_dev(dev) ({(void)(dev), 0; }) -#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; }) +#define vlan_proc_rem_dev(dev) do {} while (0) #endif #endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/Kconfig b/net/Kconfig index 1743546..b80efec 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -392,6 +392,10 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config DST_CACHE + bool "dst cache" + default n + endif # if NET # Used by archs to tell that they support BPF_JIT diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c6fc8f7..2dd40e5 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -12,7 +12,7 @@ config BATMAN_ADV B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. The networks may be wired or wireless. See - http://www.open-mesh.org/ for more information and user space + https://www.open-mesh.org/ for more information and user space tools. 
config BATMAN_ADV_BLA diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 21434ab..207e2af 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 4e59cf3..a7485d6 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index df625de..bf0e7d6 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -31,6 +31,7 @@ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/list.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/printk.h> @@ -88,7 +89,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) * in the given ring buffer * @lq_recv: pointer to the ring buffer * - * Returns computed average value. + * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { @@ -132,7 +133,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * - * Returns 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. 
*/ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, int max_if_num) @@ -180,7 +181,7 @@ unlock: * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed * - * Returns 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num) @@ -246,7 +247,7 @@ unlock: * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * - * Returns the originator object corresponding to the passed mac address or NULL + * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exists it is created an initialised. */ @@ -396,7 +397,14 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) return new_tq; } -/* is there another aggregated packet here? */ +/** + * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached + * @buff_pos: current position in the skb + * @packet_len: total length of the skb + * @tvlv_len: tvlv length of the previously considered OGM + * + * Return: true if there is enough space for another OGM, false otherwise. 
+ */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, __be16 tvlv_len) { @@ -522,7 +530,7 @@ out: * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * - * Returns true if new_packet can be aggregated with forw_packet + * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, @@ -636,10 +644,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, unsigned char *skb_buff; unsigned int skb_size; - if (!atomic_inc_not_zero(&if_incoming->refcount)) + if (!kref_get_unless_zero(&if_incoming->refcount)) return; - if (!atomic_inc_not_zero(&if_outgoing->refcount)) + if (!kref_get_unless_zero(&if_outgoing->refcount)) goto out_free_incoming; /* own packet should always be scheduled */ @@ -995,7 +1003,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && - atomic_inc_not_zero(&tmp_neigh_node->refcount)) { + kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) batadv_neigh_node_free_ref(neigh_node); neigh_node = tmp_neigh_node; @@ -1125,7 +1133,7 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Returns 1 if the link can be considered bidirectional, 0 otherwise + * Return: 1 if the link can be considered bidirectional, 0 otherwise */ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, @@ -1154,7 +1162,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (tmp_neigh_node->if_incoming != if_incoming) continue; - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + if 
(!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; @@ -1269,7 +1277,7 @@ out: * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Returns duplicate status as enum batadv_dup_status + * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, @@ -1308,7 +1316,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, - &orig_ifinfo->batman_seqno_reset)) { + BATADV_TQ_LOCAL_WINDOW_SIZE, + &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } @@ -1929,7 +1938,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * - * Returns a value less, equal to or greater than 0 if the metric via neigh1 is + * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, @@ -1970,7 +1979,7 @@ out: * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * - * Returns true if the metric via neigh1 is equally good or better than + * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 25cbc36..b56bb00 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich, Marek Lindner * @@ -29,10 +29,16 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE); } -/* receive and process one packet within the sequence number window. +/** + * batadv_bit_get_packet - receive and process one packet within the sequence + * number window + * @priv: the bat priv with all the soft interface information + * @seq_bits: pointer to the sequence number receive packet + * @seq_num_diff: difference between the current/received sequence number and + * the last sequence number + * @set_mark: whether this packet should be marked in seq_bits * - * returns: - * 1 if the window was moved (either new or very old) + * Return: 1 if the window was moved (either new or very old), * 0 if the window was not moved/shifted. */ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 0226b22..3e41bb8 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -24,7 +24,14 @@ #include <linux/compiler.h> #include <linux/types.h> -/* Returns 1 if the corresponding bit in the given seq_bits indicates true +/** + * batadv_test_bit - check if bit is set in the current window + * + * @seq_bits: pointer to the sequence number receive packet + * @last_seqno: latest sequence number in seq_bits + * @curr_seqno: sequence number to test for + * + * Return: 1 if the corresponding bit in the given seq_bits indicates true * and curr_seqno is within range of last_seqno. Otherwise returns 0. 
*/ static inline int batadv_test_bit(const unsigned long *seq_bits, @@ -48,9 +55,6 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) set_bit(n, seq_bits); /* turn the position on */ } -/* receive and process one packet, returns 1 if received seq_num is considered - * new, 0 if old - */ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, int set_mark); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index c24c481..7781f39 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -31,6 +31,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -58,7 +59,13 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); -/* return the index of the claim */ +/** + * batadv_choose_claim - choose the right bucket for a claim. + * @data: data to hash + * @size: size of the hash table + * + * Return: the hash index of the claim + */ static inline u32 batadv_choose_claim(const void *data, u32 size) { struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; @@ -70,7 +77,13 @@ static inline u32 batadv_choose_claim(const void *data, u32 size) return hash % size; } -/* return the index of the backbone gateway */ +/** + * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway. 
+ * @data: data to hash + * @size: size of the hash table + * + * Return: the hash index of the backbone gateway + */ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; @@ -82,7 +95,13 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) return hash % size; } -/* compares address and vid of two backbone gws */ +/** + * batadv_compare_backbone_gw - compare address and vid of two backbone gws + * @node: list node of the first entry to compare + * @data2: pointer to the second backbone gateway + * + * Return: 1 if the backbones have the same data, 0 otherwise + */ static int batadv_compare_backbone_gw(const struct hlist_node *node, const void *data2) { @@ -100,7 +119,13 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, return 1; } -/* compares address and vid of two claims */ +/** + * batadv_compare_claim - compare address and vid of two claims + * @node: list node of the first entry to compare + * @data2: pointer to the second claim + * + * Return: 1 if the claims have the same data, 0 otherwise + */ static int batadv_compare_claim(const struct hlist_node *node, const void *data2) { @@ -118,35 +143,63 @@ static int batadv_compare_claim(const struct hlist_node *node, return 1; } -/* free a backbone gw */ +/** + * batadv_backbone_gw_release - release backbone gw from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the backbone gw + */ +static void batadv_backbone_gw_release(struct kref *ref) +{ + struct batadv_bla_backbone_gw *backbone_gw; + + backbone_gw = container_of(ref, struct batadv_bla_backbone_gw, + refcount); + + kfree_rcu(backbone_gw, rcu); +} + +/** + * batadv_backbone_gw_free_ref - decrement the backbone gw refcounter and + * possibly release it + * @backbone_gw: backbone gateway to be free'd + */ static void batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) { - 
if (atomic_dec_and_test(&backbone_gw->refcount)) - kfree_rcu(backbone_gw, rcu); + kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } -/* finally deinitialize the claim */ -static void batadv_claim_release(struct batadv_bla_claim *claim) +/** + * batadv_claim_release - release claim from lists and queue for free after rcu + * grace period + * @ref: kref pointer of the claim + */ +static void batadv_claim_release(struct kref *ref) { + struct batadv_bla_claim *claim; + + claim = container_of(ref, struct batadv_bla_claim, refcount); + batadv_backbone_gw_free_ref(claim->backbone_gw); kfree_rcu(claim, rcu); } -/* free a claim, call claim_free_rcu if its the last reference */ +/** + * batadv_claim_free_ref - decrement the claim refcounter and possibly + * release it + * @claim: claim to be free'd + */ static void batadv_claim_free_ref(struct batadv_bla_claim *claim) { - if (atomic_dec_and_test(&claim->refcount)) - batadv_claim_release(claim); + kref_put(&claim->refcount, batadv_claim_release); } /** - * batadv_claim_hash_find + * batadv_claim_hash_find - looks for a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @data: search data (may be local/static data) * - * looks for a claim in the hash, and returns it if found - * or NULL otherwise. + * Return: claim if found or NULL otherwise. 
*/ static struct batadv_bla_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, @@ -169,7 +222,7 @@ static struct batadv_bla_claim if (!batadv_compare_claim(&claim->hash_entry, data)) continue; - if (!atomic_inc_not_zero(&claim->refcount)) + if (!kref_get_unless_zero(&claim->refcount)) continue; claim_tmp = claim; @@ -181,12 +234,12 @@ static struct batadv_bla_claim } /** - * batadv_backbone_hash_find - looks for a claim in the hash + * batadv_backbone_hash_find - looks for a backbone gateway in the hash * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @vid: the VLAN ID * - * Returns claim if found or NULL otherwise. + * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, @@ -213,7 +266,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, &search_entry)) continue; - if (!atomic_inc_not_zero(&backbone_gw->refcount)) + if (!kref_get_unless_zero(&backbone_gw->refcount)) continue; backbone_gw_tmp = backbone_gw; @@ -224,7 +277,10 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, return backbone_gw_tmp; } -/* delete all claims for a backbone */ +/** + * batadv_bla_del_backbone_claims - delete all claims for a backbone + * @backbone_gw: backbone gateway where the claims should be removed + */ static void batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) { @@ -372,14 +428,13 @@ out: } /** - * batadv_bla_get_backbone_gw + * batadv_bla_get_backbone_gw - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local * - * searches for the backbone gw or creates a new one if it could not - * be found. 
+ * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, @@ -412,7 +467,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, ether_addr_copy(entry->orig, orig); /* one for the hash, one for returning */ - atomic_set(&entry->refcount, 2); + kref_init(&entry->refcount); + kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, @@ -445,7 +501,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, return entry; } -/* update or add the own backbone gw to make sure we announce +/** + * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the selected primary interface + * @vid: VLAN identifier + * + * update or add the own backbone gw to make sure we announce * where we receive other backbone gws */ static void @@ -542,12 +604,9 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) } /** - * batadv_bla_send_announce + * batadv_bla_send_announce - Send an announcement frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: our backbone gateway which should be announced - * - * This function sends an announcement. It is called from multiple - * places. 
*/ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) @@ -595,7 +654,8 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, claim->lasttime = jiffies; claim->backbone_gw = backbone_gw; - atomic_set(&claim->refcount, 2); + kref_init(&claim->refcount); + kref_get(&claim->refcount); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", mac, BATADV_PRINT_VID(vid)); @@ -625,7 +685,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, batadv_backbone_gw_free_ref(claim->backbone_gw); } /* set (new) backbone gw */ - atomic_inc(&backbone_gw->refcount); + kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; spin_lock_bh(&backbone_gw->crc_lock); @@ -637,8 +697,11 @@ claim_free_ref: batadv_claim_free_ref(claim); } -/* Delete a claim from the claim hash which has the - * given mac address and vid. +/** + * batadv_bla_del_claim - delete a claim from the claim hash + * @bat_priv: the bat priv with all the soft interface information + * @mac: mac address of the claim to be removed + * @vid: VLAN id for the claim to be removed */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) @@ -666,7 +729,15 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_claim_free_ref(claim); } -/* check for ANNOUNCE frame, return 1 if handled */ +/** + * batadv_handle_announce - check for ANNOUNCE frame + * @bat_priv: the bat priv with all the soft interface information + * @an_addr: announcement mac address (ARP Sender HW address) + * @backbone_addr: originator address of the sender (Ethernet source MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { @@ -716,7 +787,16 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, 
return 1; } -/* check for REQUEST frame, return 1 if handled */ +/** + * batadv_handle_request - check for REQUEST frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: backbone address to be requested (ARP sender HW MAC) + * @ethhdr: ethernet header of a packet + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, struct ethhdr *ethhdr, @@ -740,7 +820,16 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, return 1; } -/* check for UNCLAIM frame, return 1 if handled */ +/** + * batadv_handle_unclaim - check for UNCLAIM frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: originator address of the backbone (Ethernet source) + * @claim_addr: Client to be unclaimed (ARP sender HW MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, u8 *claim_addr, @@ -769,7 +858,16 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, return 1; } -/* check for CLAIM frame, return 1 if handled */ +/** + * batadv_handle_claim - check for CLAIM frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: originator address of the backbone (Ethernet Source) + * @claim_addr: client mac address to be claimed (ARP sender HW MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, u8 *claim_addr, @@ -798,7 +896,7 @@ static int 
batadv_handle_claim(struct batadv_priv *bat_priv, } /** - * batadv_check_claim_group + * batadv_check_claim_group - check for claim group membership * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header @@ -809,7 +907,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, * This function also applies the group ID of the sender * if it is in the same mesh. * - * returns: + * Return: * 2 - if it is a claim packet and on the same group * 1 - if is a claim packet from another group * 0 - if it is not a claim packet @@ -873,14 +971,12 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, } /** - * batadv_bla_process_claim + * batadv_bla_process_claim - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked * - * Check if this is a claim frame, and process it accordingly. - * - * returns 1 if it was a claim frame, otherwise return 0 to + * Return: 1 if it was a claim frame, otherwise return 0 to * tell the callee that it can use the frame on its own. */ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, @@ -1011,7 +1107,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, return 1; } -/* Check when we last heard from other nodes, and remove them in case of +/** + * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or + * immediately + * @bat_priv: the bat priv with all the soft interface information + * @now: whether the whole hash shall be wiped now + * + * Check when we last heard from other nodes, and remove them in case of * a time out, or clean all backbone gws if now is set. 
*/ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) @@ -1059,7 +1161,7 @@ purge_now: } /** - * batadv_bla_purge_claims + * batadv_bla_purge_claims - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now @@ -1108,12 +1210,11 @@ purge_now: } /** - * batadv_bla_update_orig_address + * batadv_bla_update_orig_address - Update the backbone gateways when the own + * originator address changes * @bat_priv: the bat priv with all the soft interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL - * - * Update the backbone gateways when the own orig address changes. */ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, @@ -1184,7 +1285,11 @@ void batadv_bla_status_update(struct net_device *net_dev) batadv_hardif_free_ref(primary_if); } -/* periodic work to do: +/** + * batadv_bla_periodic_work - performs periodic bla work + * @work: kernel work struct + * + * periodic work to do: * * purge structures when they are too old * * send announcements */ @@ -1265,7 +1370,12 @@ out: static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; -/* initialize all bla structures */ +/** + * batadv_bla_init - initialize all bla structures + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success, < 0 on error. + */ int batadv_bla_init(struct batadv_priv *bat_priv) { int i; @@ -1320,7 +1430,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** - * batadv_bla_check_bcast_duplist + * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup. 
* @bat_priv: the bat priv with all the soft interface information * @skb: contains the bcast_packet to be checked * @@ -1332,6 +1442,8 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. + * + * Return: 1 if a packet is in the duplicate list, 0 otherwise. */ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -1390,14 +1502,13 @@ out: } /** - * batadv_bla_is_backbone_gw_orig + * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for + * the VLAN identified by vid. * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address * @vid: VLAN identifier * - * Check if the originator is a gateway for the VLAN identified by vid. - * - * Returns true if orig is a backbone for this vid, false otherwise. + * Return: true if orig is a backbone for this vid, false otherwise. */ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) @@ -1431,14 +1542,13 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, } /** - * batadv_bla_is_backbone_gw + * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN. * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * - * bla_is_backbone_gw inspects the skb for the VLAN ID and returns 1 - * if the orig_node is also a gateway on the soft interface, otherwise it - * returns 0. + * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise + * it returns 0. 
*/ int batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) @@ -1465,7 +1575,12 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, return 1; } -/* free all bla structures (for softinterface free or module unload) */ +/** + * batadv_bla_free - free all bla structures + * @bat_priv: the bat priv with all the soft interface information + * + * for softinterface free or module unload + */ void batadv_bla_free(struct batadv_priv *bat_priv) { struct batadv_hard_iface *primary_if; @@ -1488,18 +1603,19 @@ void batadv_bla_free(struct batadv_priv *bat_priv) } /** - * batadv_bla_rx + * batadv_bla_rx - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @is_bcast: the packet came in a broadcast packet type. * - * bla_rx avoidance checks if: + * batadv_bla_rx avoidance checks if: * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function and - * returns 1, otherwise it returns 0 and the caller shall further + * in these cases, the skb is further handled by this function + * + * Return: 1 if handled, otherwise it returns 0 and the caller shall further * process the skb. */ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -1583,20 +1699,21 @@ out: } /** - * batadv_bla_tx + * batadv_bla_tx - check packets going into the mesh * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * - * bla_tx checks if: + * batadv_bla_tx checks if: * * a claim was received which has to be processed * * the frame is allowed on the mesh * - * in these cases, the skb is further handled by this function and - * returns 1, otherwise it returns 0 and the caller shall further - * process the skb. + * in these cases, the skb is further handled by this function. 
* * This call might reallocate skb data. + * + * Return: 1 if handled, otherwise it returns 0 and the caller shall further + * process the skb. */ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) @@ -1670,6 +1787,13 @@ out: return ret; } +/** + * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1719,6 +1843,14 @@ out: return 0; } +/** + * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq + * file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 7ea199b..579f0fa 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 037ad0a..48253cf 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -281,6 +281,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file) * originator table of an hard interface * @inode: inode pointer to debugfs file * @file: pointer to the seq_file + * + * Return: 0 on success or negative error number in case of failure */ static int batadv_originators_hardif_open(struct inode *inode, struct file *file) @@ -329,6 +331,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode, * batadv_dat_cache_open - Prepare file handler for reads from dat_chache * @inode: inode which was opened * @file: file handle to be initialized + * + * Return: 0 on success or negative error number in case of failure */ static int batadv_dat_cache_open(struct inode *inode, struct file *file) { @@ -483,6 +487,8 @@ void batadv_debugfs_destroy(void) * batadv_debugfs_add_hardif - creates the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which should be added. + * + * Return: 0 on success or negative error number in case of failure */ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 80ab8d6..1ab4e2e6 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index a49c705..e326111 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * @@ -30,6 +30,7 @@ #include <linux/in.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -62,21 +63,34 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) } /** + * batadv_dat_entry_release - release dat_entry from lists and queue for free + * after rcu grace period + * @ref: kref pointer of the dat_entry + */ +static void batadv_dat_entry_release(struct kref *ref) +{ + struct batadv_dat_entry *dat_entry; + + dat_entry = container_of(ref, struct batadv_dat_entry, refcount); + + kfree_rcu(dat_entry, rcu); +} + +/** * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly - * free it - * @dat_entry: the entry to free + * release it + * @dat_entry: dat_entry to be free'd */ static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry) { - if (atomic_dec_and_test(&dat_entry->refcount)) - kfree_rcu(dat_entry, rcu); + kref_put(&dat_entry->refcount, batadv_dat_entry_release); } /** * batadv_dat_to_purge - check whether a dat_entry has to be purged or not * @dat_entry: the entry to check * - * Returns true if the entry has to be purged now, false otherwise. + * Return: true if the entry has to be purged now, false otherwise. */ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) { @@ -151,7 +165,7 @@ static void batadv_dat_purge(struct work_struct *work) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entries are the same, 0 otherwise. + * Return: 1 if the two entries are the same, 0 otherwise. 
*/ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) { @@ -166,7 +180,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_src field in the ARP packet. + * Return: the value of the hw_src field in the ARP packet. */ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { @@ -183,7 +197,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_src field in the ARP packet. + * Return: the value of the ip_src field in the ARP packet. */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { @@ -195,7 +209,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_dst field in the ARP packet. + * Return: the value of the hw_dst field in the ARP packet. */ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { @@ -207,7 +221,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_dst field in the ARP packet. + * Return: the value of the ip_dst field in the ARP packet. */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { @@ -219,7 +233,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data. + * Return: the selected index in the hash table for the given data. 
*/ static u32 batadv_hash_dat(const void *data, u32 size) { @@ -256,7 +270,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) * @ip: search key * @vid: VLAN identifier * - * Returns the dat_entry if found, NULL otherwise. + * Return: the dat_entry if found, NULL otherwise. */ static struct batadv_dat_entry * batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, @@ -281,7 +295,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, if (dat_entry->ip != ip) continue; - if (!atomic_inc_not_zero(&dat_entry->refcount)) + if (!kref_get_unless_zero(&dat_entry->refcount)) continue; dat_entry_tmp = dat_entry; @@ -326,7 +340,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, dat_entry->vid = vid; ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; - atomic_set(&dat_entry->refcount, 2); + kref_init(&dat_entry->refcount); + kref_get(&dat_entry->refcount); hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, batadv_hash_dat, dat_entry, @@ -440,7 +455,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, * @candidate: orig_node under evaluation * @max_orig_node: last selected candidate * - * Returns true if the node has been elected as next candidate or false + * Return: true if the node has been elected as next candidate or false * otherwise. */ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, @@ -527,7 +542,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, max_orig_node)) continue; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) continue; max = tmp_max; @@ -558,7 +573,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * closest values (from the LEFT, with wrap around if needed) then the hash * value of the key. ip_dst is the key. * - * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM. 
+ * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) @@ -602,7 +617,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) * This function copies the skb with pskb_copy() and is sent as unicast packet * to each of the selected candidates. * - * Returns true if the packet is sent to at least one candidate, false + * Return: true if the packet is sent to at least one candidate, false * otherwise. */ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, @@ -741,6 +756,8 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) /** * batadv_dat_init - initialise the DAT internals * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 in case of success, a negative error code otherwise */ int batadv_dat_init(struct batadv_priv *bat_priv) { @@ -779,6 +796,8 @@ void batadv_dat_free(struct batadv_priv *bat_priv) * batadv_dat_cache_seq_print_text - print the local DAT hash table * @seq: seq file to print on * @offset: not used + * + * Return: always 0 */ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) { @@ -831,7 +850,7 @@ out: * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb * - * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise. + * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ static u16 batadv_arp_get_type(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -904,8 +923,9 @@ out: * @skb: the buffer containing the packet to extract the VID from * @hdr_size: the size of the batman-adv header encapsulating the packet * - * If the packet embedded in the skb is vlan tagged this function returns the - * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned. 
+ * Return: If the packet embedded in the skb is vlan tagged this function + * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS + * is returned. */ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) { @@ -930,7 +950,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * - * Returns true if the message has been sent to the dht candidates, false + * Return: true if the message has been sent to the dht candidates, false * otherwise. In case of a positive return value the message has to be enqueued * to permit the fallback. */ @@ -1020,7 +1040,7 @@ out: * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the request has been answered, false otherwise. + * Return: true if the request has been answered, false otherwise. */ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -1143,7 +1163,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the packet was snooped and consumed by DAT. False if the + * Return: true if the packet was snooped and consumed by DAT. False if the * packet has to be delivered to the interface */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, @@ -1200,7 +1220,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the broadcast packet * - * Returns true if the node can drop the packet, false otherwise. + * Return: true if the node can drop the packet, false otherwise. 
*/ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 26d4a52..813ecea 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 20d9282..55656e8 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -85,7 +85,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, /** * batadv_frag_size_limit - maximum possible size of packet to be fragmented * - * Returns the maximum size of payload that can be fragmented. + * Return: the maximum size of payload that can be fragmented. */ static int batadv_frag_size_limit(void) { @@ -107,7 +107,7 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Returns true if chain is empty and caller can just insert the new fragment + * Return: true if chain is empty and caller can just insert the new fragment * without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, @@ -136,7 +136,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, * Insert a new fragment into the reverse ordered chain in the right table * entry. The hash table entry is cleared if "old" fragments exist in it. * - * Returns true if skb is buffered, false on error. If the chain has all the + * Return: true if skb is buffered, false on error. 
If the chain has all the * fragments needed to merge the packet, the chain is moved to the passed head * to avoid locking the chain in the table. */ @@ -242,12 +242,11 @@ err: /** * batadv_frag_merge_packets - merge a chain of fragments * @chain: head of chain with fragments - * @skb: packet with total size of skb after merging * * Expand the first skb in the chain and copy the content of the remaining * skb's into the expanded one. After doing so, clear the chain. * - * Returns the merged skb or NULL on error. + * Return: the merged skb or NULL on error. */ static struct sk_buff * batadv_frag_merge_packets(struct hlist_head *chain) @@ -307,6 +306,9 @@ free: * There are three possible outcomes: 1) Packet is merged: Return true and * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and leave skb as is. + * + * Return: true when packet is merged or buffered, false when skb is not + * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, struct batadv_orig_node *orig_node_src) @@ -344,7 +346,7 @@ out_err: * will exceed the MTU towards the next-hop. If so, the fragment is forwarded * without merging it. * - * Returns true if the fragment is consumed/forwarded, false otherwise. + * Return: true if the fragment is consumed/forwarded, false otherwise. */ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_hard_iface *recv_if, @@ -399,7 +401,7 @@ out: * passed mtu and the old one with the rest. The new skb contains data from the * tail of the old skb. * - * Returns the new fragment, NULL on error. + * Return: the new fragment, NULL on error. */ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, struct batadv_frag_packet *frag_head, @@ -433,7 +435,7 @@ err: * @orig_node: final destination of the created fragments * @neigh_node: next-hop of the created fragments * - * Returns true on success, false otherwise. + * Return: true on success, false otherwise.
*/ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 8b9877e..9ff77c7 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -42,7 +42,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, * batadv_frag_check_entry - check if a list of fragments has timed out * @frags_entry: table entry to check * - * Returns true if the frags entry has timed out, false otherwise. + * Return: true if the frags entry has timed out, false otherwise. */ static inline bool batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index ccf70be..4b598bd 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -28,6 +28,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rculist.h> @@ -59,12 +60,29 @@ */ #define BATADV_DHCP_CHADDR_OFFSET 28 +/** + * batadv_gw_node_release - release gw_node from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the gw_node + */ +static void batadv_gw_node_release(struct kref *ref) +{ + struct batadv_gw_node *gw_node; + + gw_node = container_of(ref, struct batadv_gw_node, refcount); + + batadv_orig_node_free_ref(gw_node->orig_node); + kfree_rcu(gw_node, rcu); +} + +/** + * batadv_gw_node_free_ref - decrement the gw_node refcounter and possibly + * release it + * @gw_node: gateway node to free + */ static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node) { - if (atomic_dec_and_test(&gw_node->refcount)) { - batadv_orig_node_free_ref(gw_node->orig_node); - kfree_rcu(gw_node, rcu); - } + kref_put(&gw_node->refcount, batadv_gw_node_release); } static struct batadv_gw_node * @@ -77,7 +95,7 @@ batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) if (!gw_node) goto out; - if (!atomic_inc_not_zero(&gw_node->refcount)) + if (!kref_get_unless_zero(&gw_node->refcount)) gw_node = NULL; out: @@ -100,7 +118,7 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) if (!orig_node) goto unlock; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; unlock: @@ -118,7 +136,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->gw.list_lock); - if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount)) + if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount)) new_gw_node = NULL; curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); @@ -170,7 +188,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (!router_ifinfo) goto 
next; - if (!atomic_inc_not_zero(&gw_node->refcount)) + if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; @@ -188,7 +206,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (curr_gw) batadv_gw_node_free_ref(curr_gw); curr_gw = gw_node; - atomic_inc(&curr_gw->refcount); + kref_get(&curr_gw->refcount); } break; @@ -203,7 +221,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (curr_gw) batadv_gw_node_free_ref(curr_gw); curr_gw = gw_node; - atomic_inc(&curr_gw->refcount); + kref_get(&curr_gw->refcount); } break; } @@ -423,7 +441,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) return; gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); @@ -436,7 +454,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - atomic_set(&gw_node->refcount, 1); + kref_init(&gw_node->refcount); spin_lock_bh(&bat_priv->gw.list_lock); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); @@ -456,7 +474,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * - * Returns gateway node if found or NULL otherwise. + * Return: gateway node if found or NULL otherwise. 
*/ static struct batadv_gw_node * batadv_gw_node_get(struct batadv_priv *bat_priv, @@ -469,7 +487,7 @@ batadv_gw_node_get(struct batadv_priv *bat_priv, if (gw_node_tmp->orig_node != orig_node) continue; - if (!atomic_inc_not_zero(&gw_node_tmp->refcount)) + if (!kref_get_unless_zero(&gw_node_tmp->refcount)) continue; gw_node = gw_node_tmp; @@ -656,13 +674,13 @@ out: * @chaddr: buffer where the client address will be stored. Valid * only if the function returns BATADV_DHCP_TO_CLIENT * - * Returns: + * This function may re-allocate the data buffer of the skb passed as argument. + * + * Return: * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error * while parsing it * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client - * - * This function may re-allocate the data buffer of the skb passed as argument. */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, @@ -777,11 +795,11 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, * server. Due to topology changes it may be the case that the GW server * previously selected is not the best one anymore. * - * Returns true if the packet destination is unicast and it is not the best gw, - * false otherwise. - * * This call might reallocate skb data. * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. + * + * Return: true if the packet destination is unicast and it is not the best gw, + * false otherwise. */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index fa95277..582dd8c 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b51bfac..5ee04f7 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -38,7 +38,7 @@ * @description: text shown when throughput string cannot be parsed * @throughput: pointer holding the returned throughput information * - * Returns false on parse error and true otherwise. + * Return: false on parse error and true otherwise. */ static bool batadv_parse_throughput(struct net_device *net_dev, char *buff, const char *description, u32 *throughput) diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index ab893e3..b583463 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 57f71071..e2aaa4c 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,7 @@ #include "hard-interface.h" #include "main.h" +#include <linux/atomic.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> @@ -26,6 +27,7 @@ #include <linux/if_ether.h> #include <linux/if.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -47,13 +49,19 @@ #include "sysfs.h" #include "translation-table.h" -void batadv_hardif_free_rcu(struct rcu_head *rcu) +/** + * batadv_hardif_release - release hard interface from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the hard interface + */ +void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; - hard_iface = container_of(rcu, struct batadv_hard_iface, rcu); + hard_iface = container_of(ref, struct batadv_hard_iface, refcount); dev_put(hard_iface->net_dev); - kfree(hard_iface); + + kfree_rcu(hard_iface, rcu); } struct batadv_hard_iface * @@ -64,7 +72,7 @@ batadv_hardif_get_by_netdev(const struct net_device *net_dev) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && - atomic_inc_not_zero(&hard_iface->refcount)) + kref_get_unless_zero(&hard_iface->refcount)) goto out; } @@ -107,7 +115,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * This function recursively checks all the fathers of the device passed as * argument looking for a batman-adv soft interface. 
* - * Returns true if the device is descendant of a batman-adv mesh interface (or + * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) @@ -161,7 +169,7 @@ static int batadv_is_valid_iface(const struct net_device *net_dev) * interface * @net_device: the device to check * - * Returns true if the net device is a 802.11 wireless device, false otherwise. + * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_netdev(struct net_device *net_device) { @@ -194,7 +202,7 @@ batadv_hardif_get_active(const struct net_device *soft_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && - atomic_inc_not_zero(&hard_iface->refcount)) + kref_get_unless_zero(&hard_iface->refcount)) goto out; } @@ -228,7 +236,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, ASSERT_RTNL(); - if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount)) + if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount)) new_hard_iface = NULL; curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); @@ -426,7 +434,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * * Invoke ndo_del_slave on master passing slave as argument. In this way slave * is free'd and master can correctly change its internal state. 
- * Return 0 on success, a negative value representing the error otherwise + * + * Return: 0 on success, a negative value representing the error otherwise */ static int batadv_master_del_slave(struct batadv_hard_iface *slave, struct net_device *master) @@ -455,7 +464,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) goto out; soft_iface = dev_get_by_name(&init_net, iface_name); @@ -676,7 +685,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; /* extra reference for return */ - atomic_set(&hard_iface->refcount, 2); + kref_init(&hard_iface->refcount); + kref_get(&hard_iface->refcount); batadv_check_known_mac_addr(hard_iface->net_dev); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 7b12ea8..5cecc6b 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -20,8 +20,8 @@ #include "main.h" -#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/kref.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> @@ -61,18 +61,16 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, void batadv_hardif_remove_interfaces(void); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); -void batadv_hardif_free_rcu(struct rcu_head *rcu); +void batadv_hardif_release(struct kref *ref); /** - * batadv_hardif_free_ref - decrement the hard interface refcounter and - * possibly free it + * batadv_hardif_free_ref - decrement the hard interface refcounter and possibly + * release it * @hard_iface: the hard interface to free */ -static inline void -batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) +static inline void batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) { - if (atomic_dec_and_test(&hard_iface->refcount)) - call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); + kref_put(&hard_iface->refcount, batadv_hardif_release); } static inline struct batadv_hard_iface * @@ -85,7 +83,7 @@ batadv_primary_if_get_selected(struct batadv_priv *bat_priv) if (!hard_iface) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) hard_iface = NULL; out: diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 2ea6a18..a0a0fdb 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 3776262..9bb57b8 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. 
contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -30,14 +30,17 @@ struct lock_class_key; /* callback to a compare function. should compare 2 element datas for their - * keys, return 0 if same and not 0 if not same + * keys + * + * Return: 0 if same and not 0 if not same */ typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, const void *); -/* the hashfunction, should return an index - * based on the key in the data of the first - * argument and the size the second +/* the hashfunction + * + * Return: an index based on the key in the data of the first argument and the + * size the second */ typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); @@ -96,7 +99,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, * @data: data passed to the aforementioned callbacks as argument * @data_node: to be added element * - * Returns 0 on success, 1 if the element already is in the hash + * Return: 0 on success, 1 if the element already is in the hash * and -1 on error. */ static inline int batadv_hash_add(struct batadv_hashtable *hash, @@ -139,10 +142,11 @@ out: return ret; } -/* removes data from hash, if found. returns pointer do data on success, so you - * can remove the used structure yourself, or NULL on error . data could be the - * structure you use with just the key filled, we just need the key for - * comparing. +/* removes data from hash, if found. data could be the structure you use with + * just the key filled, we just need the key for comparing. 
+ * + * Return: pointer to data on success, so you can remove the used + * structure yourself, or NULL on error */ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index bcabb5e..a69da37 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index e937143..618d5de 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4b5d61f..568c550 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -29,6 +29,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/module.h> @@ -233,7 +234,7 @@ void batadv_mesh_free(struct net_device *soft_iface) * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check * - * Returns 'true' if the mac address was found, false otherwise. + * Return: 'true' if the mac address was found, false otherwise. */ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { @@ -262,7 +263,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) * function that requires the primary interface * @seq: debugfs table seq_file struct * - * Returns primary interface if found or NULL otherwise.
+ * Return: primary interface if found or NULL otherwise. */ struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq) @@ -297,7 +298,7 @@ out: * batadv_max_header_len - calculate maximum encapsulation overhead for a * payload packet * - * Return the maximum encapsulation overhead in bytes. + * Return: the maximum encapsulation overhead in bytes. */ int batadv_max_header_len(void) { @@ -599,6 +600,8 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. + * + * Return: big endian crc32c of the checksummed data */ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { @@ -622,15 +625,27 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) } /** - * batadv_tvlv_handler_free_ref - decrement the tvlv handler refcounter and - * possibly free it + * batadv_tvlv_handler_release - release tvlv handler from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_handler_release(struct kref *ref) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); + kfree_rcu(tvlv_handler, rcu); +} + +/** + * batadv_tvlv_handler_free_ref - decrement the tvlv container refcounter and + * possibly release it * @tvlv_handler: the tvlv handler to free */ static void batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) { - if (atomic_dec_and_test(&tvlv_handler->refcount)) - kfree_rcu(tvlv_handler, rcu); + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } /** @@ -640,7 +655,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) * @type: tvlv handler type to look for * @version: tvlv handler version to look for * - * Returns tvlv handler if found or NULL otherwise. + * Return: tvlv handler if found or NULL otherwise. 
*/ static struct batadv_tvlv_handler *batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) @@ -656,7 +671,7 @@ static struct batadv_tvlv_handler if (tvlv_handler_tmp->version != version) continue; - if (!atomic_inc_not_zero(&tvlv_handler_tmp->refcount)) + if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) continue; tvlv_handler = tvlv_handler_tmp; @@ -668,14 +683,25 @@ static struct batadv_tvlv_handler } /** + * batadv_tvlv_container_release - release tvlv from lists and free + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_container_release(struct kref *ref) +{ + struct batadv_tvlv_container *tvlv; + + tvlv = container_of(ref, struct batadv_tvlv_container, refcount); + kfree(tvlv); +} + +/** * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and - * possibly free it + * possibly release it * @tvlv: the tvlv container to free */ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) { - if (atomic_dec_and_test(&tvlv->refcount)) - kfree(tvlv); + kref_put(&tvlv->refcount, batadv_tvlv_container_release); } /** @@ -688,13 +714,15 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * - * Returns tvlv container if found or NULL otherwise. + * Return: tvlv container if found or NULL otherwise. 
*/ static struct batadv_tvlv_container *batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { if (tvlv_tmp->tvlv_hdr.type != type) continue; @@ -702,7 +730,7 @@ static struct batadv_tvlv_container if (tvlv_tmp->tvlv_hdr.version != version) continue; - if (!atomic_inc_not_zero(&tvlv_tmp->refcount)) + if (!kref_get_unless_zero(&tvlv_tmp->refcount)) continue; tvlv = tvlv_tmp; @@ -720,13 +748,15 @@ static struct batadv_tvlv_container * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * - * Returns size of all currently registered tvlv containers in bytes. + * Return: size of all currently registered tvlv containers in bytes. */ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; u16 tvlv_len = 0; + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); tvlv_len += ntohs(tvlv->tvlv_hdr.len); @@ -808,7 +838,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); INIT_HLIST_NODE(&tvlv_new->list); - atomic_set(&tvlv_new->refcount, 1); + kref_init(&tvlv_new->refcount); spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); @@ -826,7 +856,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, * @additional_packet_len: requested additional packet size on top of minimum * size * - * Returns true of the packet buffer could be changed to the requested size, + * Return: true of the packet buffer could be changed to the requested size, * false otherwise. 
*/ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, @@ -862,7 +892,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * - * Returns size of all appended tvlv containers in bytes. + * Return: size of all appended tvlv containers in bytes. */ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, @@ -915,7 +945,7 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Returns success if handler was not found or the return value of the handler + * Return: success if handler was not found or the return value of the handler * callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, @@ -968,7 +998,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Returns success when processing an OGM or the return value of all called + * Return: success when processing an OGM or the return value of all called * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, @@ -1094,7 +1124,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; - atomic_set(&tvlv_handler->refcount, 1); + kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); @@ -1190,8 +1220,8 @@ out: * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * - * If the packet embedded in the skb is vlan tagged this function returns the - * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned. 
+ * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the + * skb is vlan tagged. Otherwise BATADV_NO_FLAGS. */ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) { @@ -1218,7 +1248,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Returns true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN idenfied by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 9dbd910..32dfc9e 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -35,6 +35,9 @@ /* Time To Live of broadcast messages */ #define BATADV_TTL 50 +/* maximum sequence number age of broadcast messages */ +#define BATADV_BCAST_MAX_AGE 64 + /* purge originators after time in seconds if no valid packet comes in * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ @@ -273,9 +276,14 @@ static inline void _batadv_dbg(int type __always_unused, pr_err("%s: " fmt, _netdev->name, ## arg); \ } while (0) -/* returns 1 if they are the same ethernet addr +/** + * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses + * @data1: Pointer to a six-byte array containing the Ethernet address + * @data2: Pointer other six-byte array containing the Ethernet address * * note: can't use ether_addr_equal() as it requires aligned memory + * + * Return: 1 if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { @@ -287,7 +295,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2) * @timestamp: base value to compare with (in 
jiffies) * @timeout: added to base value before comparing (in milliseconds) * - * Returns true if current time is after timestamp + timeout + * Return: true if current time is after timestamp + timeout */ static inline bool batadv_has_timed_out(unsigned long timestamp, unsigned int timeout) @@ -326,7 +334,13 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) -/* Sum and return the cpu-local counters for index 'idx' */ +/** + * batadv_sum_counter - Sum the cpu-local counters for index 'idx' + * @bat_priv: the bat priv with all the soft interface information + * @idx: index of counter to sum up + * + * Return: sum of all cpu-local counters + */ static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 75fa501..8caa2c7 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -30,6 +30,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -55,7 +56,7 @@ * Collect multicast addresses of the local multicast listeners * on the given soft interface, dev, in the given mcast_list. * - * Returns -ENOMEM on memory allocation error or the number of + * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get(struct net_device *dev, @@ -87,7 +88,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * - * Returns true if the given address is already in the given list. 
+ * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, @@ -195,8 +196,9 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, * batadv_mcast_has_bridge - check whether the soft-iface is bridged * @bat_priv: the bat priv with all the soft interface information * - * Checks whether there is a bridge on top of our soft interface. Returns - * true if so, false otherwise. + * Checks whether there is a bridge on top of our soft interface. + * + * Return: true if there is a bridge, false otherwise. */ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) { @@ -218,7 +220,7 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. * - * Returns true if the tvlv container is registered afterwards. Otherwise + * Return: true if the tvlv container is registered afterwards. Otherwise * returns false. */ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) @@ -289,8 +291,8 @@ out: * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of - * memory allocation failure. + * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory + * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -327,8 +329,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out - * of memory. + * Return: If so then 0. 
Otherwise -EINVAL or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -366,8 +367,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out - * of memory. + * Return: If so then 0. Otherwise -EINVAL or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -398,7 +398,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * - * Returns the number of nodes which want all IPv4 multicast traffic if the + * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ @@ -421,7 +421,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination * - * Returns an orig_node matching the multicast address provided by ethhdr + * Return: an orig_node matching the multicast address provided by ethhdr * via a translation table lookup. This increases the returned nodes refcount.
*/ static struct batadv_orig_node * @@ -436,7 +436,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and * increases its refcount. */ static struct batadv_orig_node * @@ -448,7 +448,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -463,7 +463,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set * and increases its refcount. 
*/ static struct batadv_orig_node * @@ -475,7 +475,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -491,7 +491,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and * increases its refcount. */ @@ -514,7 +514,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag * set and increases its refcount. 
*/ static struct batadv_orig_node * @@ -526,7 +526,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_unsnoopables_list, mcast_want_all_unsnoopables_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -543,7 +543,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) * @skb: The multicast packet to check * @orig: an originator to be set to forward the skb to * - * Returns the forwarding mode as enum batadv_forw_mode and in case of + * Return: the forwarding mode as enum batadv_forw_mode and in case of * BATADV_FORW_SINGLE set the orig to the single originator the skb * should be forwarded to. */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 8f3cb04..80bceec 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -23,7 +23,7 @@ struct sk_buff; /** - * batadv_forw_mode - the way a packet should be forwarded as + * enum batadv_forw_mode - the way a packet should be forwarded as * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic * flooding) * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index cc63b44..a4eb8ee 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2016 B.A.T.M.A.N. 
contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -32,6 +32,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -64,6 +65,8 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, /** * batadv_nc_init - one-time initialization for network coding + * + * Return: 0 on success or negative error number in case of failure */ int __init batadv_nc_init(void) { @@ -142,6 +145,8 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** * batadv_nc_mesh_init - initialise coding hash table and start house keeping * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure */ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) { @@ -205,34 +210,50 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) /** * batadv_nc_node_release - release nc_node from lists and queue for free after * rcu grace period - * @nc_node: the nc node to free + * @ref: kref pointer of the nc_node */ -static void batadv_nc_node_release(struct batadv_nc_node *nc_node) +static void batadv_nc_node_release(struct kref *ref) { + struct batadv_nc_node *nc_node; + + nc_node = container_of(ref, struct batadv_nc_node, refcount); + batadv_orig_node_free_ref(nc_node->orig_node); kfree_rcu(nc_node, rcu); } /** - * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly + * batadv_nc_node_free_ref - decrement the nc_node refcounter and possibly * release it - * @nc_node: the nc node to free + * @nc_node: nc_node to be free'd */ static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) { - if (atomic_dec_and_test(&nc_node->refcount)) - batadv_nc_node_release(nc_node); + kref_put(&nc_node->refcount, batadv_nc_node_release); } /** - * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly - * frees it - * 
@nc_path: the nc node to free + * batadv_nc_path_release - release nc_path from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the nc_path + */ +static void batadv_nc_path_release(struct kref *ref) +{ + struct batadv_nc_path *nc_path; + + nc_path = container_of(ref, struct batadv_nc_path, refcount); + + kfree_rcu(nc_path, rcu); +} + +/** + * batadv_nc_path_free_ref - decrement the nc_path refcounter and possibly + * release it + * @nc_path: nc_path to be free'd */ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) { - if (atomic_dec_and_test(&nc_path->refcount)) - kfree_rcu(nc_path, rcu); + kref_put(&nc_path->refcount, batadv_nc_path_release); } /** @@ -251,7 +272,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) * @bat_priv: the bat priv with all the soft interface information * @nc_node: the nc node to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, struct batadv_nc_node *nc_node) @@ -267,7 +288,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) @@ -287,7 +308,7 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, struct 
batadv_nc_path *nc_path) @@ -470,7 +491,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data. + * Return: the selected index in the hash table for the given data. */ static u32 batadv_nc_hash_choose(const void *data, u32 size) { @@ -489,7 +510,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entry are the same, 0 otherwise + * Return: 1 if the two entry are the same, 0 otherwise */ static int batadv_nc_hash_compare(const struct hlist_node *node, const void *data2) @@ -516,7 +537,7 @@ static int batadv_nc_hash_compare(const struct hlist_node *node, * @hash: hash table containing the nc path * @data: search key * - * Returns the nc_path if found, NULL otherwise. + * Return: the nc_path if found, NULL otherwise. */ static struct batadv_nc_path * batadv_nc_hash_find(struct batadv_hashtable *hash, @@ -537,7 +558,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash, if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) continue; - if (!atomic_inc_not_zero(&nc_path->refcount)) + if (!kref_get_unless_zero(&nc_path->refcount)) continue; nc_path_tmp = nc_path; @@ -571,7 +592,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) * timeout. If so, the packet is no longer kept and the entry deleted from the * queue. Has to be called with the appropriate locks. * - * Returns false as soon as the entry in the fifo queue has not been timed out + * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, @@ -610,7 +631,7 @@ out: * packet is no longer delayed, immediately sent and the entry deleted from the * queue. Has to be called with the appropriate locks. 
* - * Returns false as soon as the entry in the fifo queue has not been timed out + * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, @@ -731,7 +752,7 @@ static void batadv_nc_worker(struct work_struct *work) * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks * - * Returns true if: + * Return: true if: * 1) The OGM must have the most recent sequence number. * 2) The TTL must be decremented by one and only one. * 3) The OGM must be received from the first hop from orig_node. @@ -772,7 +793,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * - * Returns the nc_node if found, NULL otherwise. + * Return: the nc_node if found, NULL otherwise. */ static struct batadv_nc_node *batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, @@ -793,7 +814,7 @@ static struct batadv_nc_node if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) continue; - if (!atomic_inc_not_zero(&nc_node->refcount)) + if (!kref_get_unless_zero(&nc_node->refcount)) continue; /* Found a match */ @@ -814,7 +835,7 @@ static struct batadv_nc_node * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * - * Returns the nc_node if found or created, NULL in case of an error. + * Return: the nc_node if found or created, NULL in case of an error. 
*/ static struct batadv_nc_node *batadv_nc_get_nc_node(struct batadv_priv *bat_priv, @@ -837,14 +858,15 @@ static struct batadv_nc_node if (!nc_node) return NULL; - if (!atomic_inc_not_zero(&orig_neigh_node->refcount)) + if (!kref_get_unless_zero(&orig_neigh_node->refcount)) goto free; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); ether_addr_copy(nc_node->addr, orig_node->orig); nc_node->orig_node = orig_neigh_node; - atomic_set(&nc_node->refcount, 2); + kref_init(&nc_node->refcount); + kref_get(&nc_node->refcount); /* Select ingoing or outgoing coding node */ if (in_coding) { @@ -932,7 +954,7 @@ out: * @src: ethernet source address - first half of the nc path search key * @dst: ethernet destination address - second half of the nc path search key * - * Returns pointer to nc_path if the path was found or created, returns NULL + * Return: pointer to nc_path if the path was found or created, returns NULL * on error. */ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, @@ -963,7 +985,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, /* Initialize nc_path */ INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); - atomic_set(&nc_path->refcount, 2); + kref_init(&nc_path->refcount); + kref_get(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); @@ -989,6 +1012,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value + * + * Return: scaled tq value */ static u8 batadv_nc_random_weight_tq(u8 tq) { @@ -1029,7 +1054,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) * @nc_packet: structure containing the packet to the skb can be coded with * @neigh_node: next hop to 
forward packet to * - * Returns true if both packets are consumed, false otherwise. + * Return: true if both packets are consumed, false otherwise. */ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -1228,7 +1253,7 @@ out: * Since the source encoded the packet we can be certain it has all necessary * decode information. * - * Returns true if coding of a decoded packet is allowed. + * Return: true if coding of a decoded packet is allowed. */ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { @@ -1246,7 +1271,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) * @skb: data skb to forward * @eth_dst: next hop mac address of skb * - * Returns true if coding of a decoded skb is allowed. + * Return: true if coding of a decoded skb is allowed. */ static struct batadv_nc_packet * batadv_nc_path_search(struct batadv_priv *bat_priv, @@ -1314,7 +1339,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, * @eth_src: source mac address of skb * @in_nc_node: pointer to skb next hop's neighbor nc node * - * Returns an nc packet if a suitable coding packet was found, NULL otherwise. + * Return: an nc packet if a suitable coding packet was found, NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, @@ -1397,7 +1422,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * next hop that potentially sent a packet which our next hop also received * (overheard) and has stored for later decoding. 
* - * Returns true if the skb was consumed (encoded packet sent) or false otherwise + * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, struct batadv_neigh_node *neigh_node, @@ -1451,7 +1476,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, * @neigh_node: next hop to forward packet to * @packet_id: checksum to identify packet * - * Returns true if the packet was buffered or false in case of an error. + * Return: true if the packet was buffered or false in case of an error. */ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, struct batadv_nc_path *nc_path, @@ -1485,7 +1510,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, * @skb: data skb to forward * @neigh_node: next hop to forward packet to * - * Returns true if the skb was consumed (encoded packet sent) or false otherwise + * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ bool batadv_nc_skb_forward(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) @@ -1624,7 +1649,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, * @skb: unicast skb to decode * @nc_packet: decode data needed to decode the skb * - * Returns pointer to decoded unicast packet if the packet was decoded or NULL + * Return: pointer to decoded unicast packet if the packet was decoded or NULL * in case of an error. */ static struct batadv_unicast_packet * @@ -1718,7 +1743,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, * @ethhdr: pointer to the ethernet header inside the coded packet * @coded: coded packet we try to find decode data for * - * Returns pointer to nc packet if the needed data was found or NULL otherwise. + * Return: pointer to nc packet if the needed data was found or NULL otherwise. 
*/ static struct batadv_nc_packet * batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, @@ -1781,6 +1806,9 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, * resulting unicast packet * @skb: incoming coded packet * @recv_if: pointer to interface this packet was received on + * + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * otherwise. */ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) @@ -1865,6 +1893,8 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) * batadv_nc_nodes_seq_print_text - print the nc node information * @seq: seq file to print on * @offset: not used + * + * Return: always 0 */ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) { @@ -1927,6 +1957,8 @@ out: /** * batadv_nc_init_debugfs - create nc folder and related files in debugfs * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure */ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 8f6d4ad..d6d7fb4 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index fe578f7..eacd0e5 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,11 +18,13 @@ #include "originator.h" #include "main.h" +#include <linux/atomic.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -47,7 +49,13 @@ static struct lock_class_key batadv_orig_hash_lock_class_key; static void batadv_purge_orig(struct work_struct *work); -/* returns 1 if they are the same originator */ +/** + * batadv_compare_orig - comparing function used in the originator hash table + * @node: node in the local table + * @data2: second object to compare the node to + * + * Return: 1 if they are the same originator + */ int batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, @@ -61,7 +69,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2) * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * - * Returns the vlan object identified by vid and belonging to orig_node or NULL + * Return: the vlan object identified by vid and belonging to orig_node or NULL * if it does not exist. */ struct batadv_orig_node_vlan * @@ -75,7 +83,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, if (tmp->vid != vid) continue; - if (!atomic_inc_not_zero(&tmp->refcount)) + if (!kref_get_unless_zero(&tmp->refcount)) continue; vlan = tmp; @@ -93,7 +101,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * - * Returns NULL in case of failure or the vlan object identified by vid and + * Return: NULL in case of failure or the vlan object identified by vid and * belonging to orig_node otherwise. The object is created and added to the list * if it does not exist. 
* @@ -116,7 +124,8 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, if (!vlan) goto out; - atomic_set(&vlan->refcount, 2); + kref_init(&vlan->refcount); + kref_get(&vlan->refcount); vlan->vid = vid; hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); @@ -128,14 +137,27 @@ out: } /** - * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly free - * the originator-vlan object + * batadv_orig_node_vlan_release - release originator-vlan object from lists + * and queue for free after rcu grace period + * @ref: kref pointer of the originator-vlan object + */ +static void batadv_orig_node_vlan_release(struct kref *ref) +{ + struct batadv_orig_node_vlan *orig_vlan; + + orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount); + + kfree_rcu(orig_vlan, rcu); +} + +/** + * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly + * release the originator-vlan object * @orig_vlan: the originator-vlan object to release */ void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan) { - if (atomic_dec_and_test(&orig_vlan->refcount)) - kfree_rcu(orig_vlan, rcu); + kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } int batadv_originator_init(struct batadv_priv *bat_priv) @@ -165,11 +187,14 @@ err: /** * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for * free after rcu grace period - * @neigh_ifinfo: the neigh_ifinfo object to release + * @ref: kref pointer of the neigh_ifinfo */ -static void -batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo) +static void batadv_neigh_ifinfo_release(struct kref *ref) { + struct batadv_neigh_ifinfo *neigh_ifinfo; + + neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount); + if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_free_ref(neigh_ifinfo->if_outgoing); @@ -183,18 +208,21 @@ batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo) */ void 
batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) { - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - batadv_neigh_ifinfo_release(neigh_ifinfo); + kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** * batadv_hardif_neigh_release - release hardif neigh node from lists and * queue for free after rcu grace period - * @hardif_neigh: hardif neigh neighbor to free + * @ref: kref pointer of the neigh_node */ -static void -batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh) +static void batadv_hardif_neigh_release(struct kref *ref) { + struct batadv_hardif_neigh_node *hardif_neigh; + + hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node, + refcount); + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); @@ -210,22 +238,23 @@ batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh) */ void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) - batadv_hardif_neigh_release(hardif_neigh); + kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** * batadv_neigh_node_release - release neigh_node from lists and queue for * free after rcu grace period - * @neigh_node: neigh neighbor to free + * @ref: kref pointer of the neigh_node */ -static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node) +static void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; + struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; + neigh_node = container_of(ref, struct batadv_neigh_node, refcount); bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, @@ -250,14 +279,13 @@ static void batadv_neigh_node_release(struct 
batadv_neigh_node *neigh_node) } /** - * batadv_neigh_node_free_ref - decrement the neighbors refcounter - * and possibly release it + * batadv_neigh_node_free_ref - decrement the neighbors refcounter and possibly + * release it * @neigh_node: neigh neighbor to free */ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) { - if (atomic_dec_and_test(&neigh_node->refcount)) - batadv_neigh_node_release(neigh_node); + kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** @@ -266,7 +294,7 @@ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Returns the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -286,7 +314,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, break; } - if (router && !atomic_inc_not_zero(&router->refcount)) + if (router && !kref_get_unless_zero(&router->refcount)) router = NULL; rcu_read_unlock(); @@ -298,7 +326,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns the requested orig_ifinfo or NULL if not found. + * Return: the requested orig_ifinfo or NULL if not found. * * The object is returned with refcounter increased by 1. 
*/ @@ -314,7 +342,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, if (tmp->if_outgoing != if_outgoing) continue; - if (!atomic_inc_not_zero(&tmp->refcount)) + if (!kref_get_unless_zero(&tmp->refcount)) continue; orig_ifinfo = tmp; @@ -330,7 +358,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing + * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing * interface otherwise. The object is created and added to the list * if it does not exist. * @@ -354,7 +382,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, goto out; if (if_outgoing != BATADV_IF_DEFAULT && - !atomic_inc_not_zero(&if_outgoing->refcount)) { + !kref_get_unless_zero(&if_outgoing->refcount)) { kfree(orig_ifinfo); orig_ifinfo = NULL; goto out; @@ -365,7 +393,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, orig_ifinfo->batman_seqno_reset = reset_time; orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); - atomic_set(&orig_ifinfo->refcount, 2); + kref_init(&orig_ifinfo->refcount); + kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); out: @@ -375,12 +404,12 @@ out: /** * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node - * @neigh_node: the neigh node to be queried + * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * The object is returned with refcounter increased by 1. 
* - * Returns the requested neigh_ifinfo or NULL if not found + * Return: the requested neigh_ifinfo or NULL if not found */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, @@ -395,7 +424,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, if (tmp_neigh_ifinfo->if_outgoing != if_outgoing) continue; - if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount)) continue; neigh_ifinfo = tmp_neigh_ifinfo; @@ -408,10 +437,10 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, /** * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object - * @neigh_node: the neigh node to be queried + * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns NULL in case of failure or the neigh_ifinfo object for the + * Return: NULL in case of failure or the neigh_ifinfo object for the * if_outgoing interface otherwise. The object is created and added to the list * if it does not exist. * @@ -433,14 +462,15 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, if (!neigh_ifinfo) goto out; - if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) { + if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) { kfree(neigh_ifinfo); neigh_ifinfo = NULL; goto out; } INIT_HLIST_NODE(&neigh_ifinfo->list); - atomic_set(&neigh_ifinfo->refcount, 2); + kref_init(&neigh_ifinfo->refcount); + kref_get(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); @@ -459,7 +489,8 @@ out: * * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. - * Returns NULL if the neighbour is not found. + * + * Return: neighbor when found. 
Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -476,7 +507,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, if (tmp_neigh_node->if_incoming != hard_iface) continue; - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; res = tmp_neigh_node; @@ -492,7 +523,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * - * Returns the hardif neighbour node if found or created or NULL otherwise. + * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, @@ -508,7 +539,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, if (hardif_neigh) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) goto out; hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); @@ -522,7 +553,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, hardif_neigh->if_incoming = hard_iface; hardif_neigh->last_seen = jiffies; - atomic_set(&hardif_neigh->refcount, 1); + kref_init(&hardif_neigh->refcount); if (bat_priv->bat_algo_ops->bat_hardif_neigh_init) bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh); @@ -540,7 +571,7 @@ out: * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * - * Returns the hardif neighbour node if found or created or NULL otherwise. + * Return: the hardif neighbour node if found or created or NULL otherwise.
*/ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, @@ -562,7 +593,8 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * @neigh_addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this hard interface. - * Returns NULL if the neighbour is not found. + * + * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -576,7 +608,7 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr)) continue; - if (!atomic_inc_not_zero(&tmp_hardif_neigh->refcount)) + if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount)) continue; hardif_neigh = tmp_hardif_neigh; @@ -594,7 +626,8 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. - * Returns the new object or NULL on failure. + * + * Return: neighbor when found.
Otherwise NULL */ struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, @@ -617,7 +650,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, if (!neigh_node) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) { + if (!kref_get_unless_zero(&hard_iface->refcount)) { kfree(neigh_node); neigh_node = NULL; goto out; @@ -632,14 +665,15 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, neigh_node->orig_node = orig_node; /* extra reference for return */ - atomic_set(&neigh_node->refcount, 2); + kref_init(&neigh_node->refcount); + kref_get(&neigh_node->refcount); spin_lock_bh(&orig_node->neigh_list_lock); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); spin_unlock_bh(&orig_node->neigh_list_lock); /* increment unique neighbor refcount */ - atomic_inc(&hardif_neigh->refcount); + kref_get(&hardif_neigh->refcount); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", @@ -656,7 +690,7 @@ out: * @seq: neighbour table seq_file struct * @offset: not used * - * Always returns 0.
+ * Return: always 0 */ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) { @@ -688,12 +722,15 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) /** * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for * free after rcu grace period - * @orig_ifinfo: the orig_ifinfo object to release + * @ref: kref pointer of the orig_ifinfo */ -static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo) +static void batadv_orig_ifinfo_release(struct kref *ref) { + struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; + orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount); + if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_free_ref(orig_ifinfo->if_outgoing); @@ -712,8 +749,7 @@ static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo) */ void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - batadv_orig_ifinfo_release(orig_ifinfo); + kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** @@ -740,14 +776,17 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) /** * batadv_orig_node_release - release orig_node from lists and queue for * free after rcu grace period - * @orig_node: the orig node to free + * @ref: kref pointer of the orig_node */ -static void batadv_orig_node_release(struct batadv_orig_node *orig_node) +static void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; + orig_node = container_of(ref, struct batadv_orig_node, refcount); + spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... 
*/ @@ -777,8 +816,7 @@ static void batadv_orig_node_release(struct batadv_orig_node *orig_node) */ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) { - if (atomic_dec_and_test(&orig_node->refcount)) - batadv_orig_node_release(orig_node); + kref_put(&orig_node->refcount, batadv_orig_node_release); } void batadv_originator_free(struct batadv_priv *bat_priv) @@ -820,7 +858,8 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * * Creates a new originator object and initialise all the generic fields. * The new object is not added to the originator list. - * Returns the newly created object or NULL on failure. + * + * Return: the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr) @@ -849,7 +888,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, batadv_nc_init_orig(orig_node); /* extra reference for return */ - atomic_set(&orig_node->refcount, 2); + kref_init(&orig_node->refcount); + kref_get(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); @@ -937,7 +977,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * - * Returns true if any ifinfo entry was purged, false otherwise. + * Return: true if any ifinfo entry was purged, false otherwise. 
*/ static bool batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, @@ -989,7 +1029,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * - * Returns true if any neighbor was purged, false otherwise + * Return: true if any neighbor was purged, false otherwise */ static bool batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, @@ -1048,7 +1088,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared * - * Returns the current best neighbor, with refcount increased. + * Return: the current best neighbor, with refcount increased. */ static struct batadv_neigh_node * batadv_find_best_neighbor(struct batadv_priv *bat_priv, @@ -1064,7 +1104,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, best, if_outgoing) <= 0)) continue; - if (!atomic_inc_not_zero(&neigh->refcount)) + if (!kref_get_unless_zero(&neigh->refcount)) continue; if (best) @@ -1085,7 +1125,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, * This function checks if the orig_node or substructures of it have become * obsolete, and purges this information if that's the case. * - * Returns true if the orig_node is to be removed, false otherwise. + * Return: true if the orig_node is to be removed, false otherwise. 
*/ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) @@ -1230,7 +1270,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) * @seq: debugfs table seq_file struct * @offset: not used * - * Returns 0 + * Return: 0 */ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) { diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index cf07304..9950740 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -20,10 +20,10 @@ #include "main.h" -#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/kref.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/stddef.h> @@ -115,7 +115,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) if (!batadv_compare_eth(orig_node, data)) continue; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) continue; orig_node_tmp = orig_node; diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 0558e32..e7f9151 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -158,7 +158,7 @@ enum batadv_tt_client_flags { }; /** - * batadv_vlan_flags - flags for the four MSB of any vlan ID field + * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not */ enum batadv_vlan_flags { @@ -209,6 +209,11 @@ struct batadv_bla_claim_dst { * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header * @flags: contains routing relevant flags - see enum batadv_iv_flags + * @seqno: sequence identification + * @orig: address of the source node + * @prev_sender: address of the previous sender + * @reserved: reserved byte for alignment + * @tq: transmission quality * @tvlv_len: length of tvlv data following the ogm header */ struct batadv_ogm_packet { @@ -230,7 +235,7 @@ struct batadv_ogm_packet { #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) /** - * batadv_icmp_header - common members among all the ICMP packets + * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -256,7 +261,7 @@ struct batadv_icmp_header { }; /** - * batadv_icmp_packet - ICMP packet + * struct batadv_icmp_packet - ICMP packet * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -282,7 +287,7 @@ struct batadv_icmp_packet { #define BATADV_RR_LEN 16 /** - * batadv_icmp_packet_rr - ICMP RouteRecord packet + * struct batadv_icmp_packet_rr - ICMP RouteRecord packet * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal 
header * @ttl: time to live for this packet, part of the genereal header @@ -345,6 +350,7 @@ struct batadv_unicast_packet { * @u: common unicast packet header * @src: address of the source * @subtype: packet subtype + * @reserved: reserved byte for alignment */ struct batadv_unicast_4addr_packet { struct batadv_unicast_packet u; @@ -413,7 +419,6 @@ struct batadv_bcast_packet { * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header - * @reserved: Align following fields to 2-byte boundaries * @first_source: original source of first included packet * @first_orig_dest: original destinal of first included packet * @first_crc: checksum of first included packet @@ -495,7 +500,7 @@ struct batadv_tvlv_gateway_data { * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container * @flags: translation table flags (see batadv_tt_data_flags) * @ttvn: translation table version number - * @vlan_num: number of announced VLANs. In the TVLV this struct is followed by + * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by * one batadv_tvlv_tt_vlan_data object per announced vlan */ struct batadv_tvlv_tt_data { diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index e4f2646..205310b 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -25,6 +25,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/jiffies.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> @@ -72,7 +73,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, rcu_read_lock(); curr_router = rcu_dereference(orig_ifinfo->router); - if (curr_router && !atomic_inc_not_zero(&curr_router->refcount)) + if (curr_router && !kref_get_unless_zero(&curr_router->refcount)) curr_router = NULL; rcu_read_unlock(); @@ -100,7 +101,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, batadv_neigh_node_free_ref(curr_router); /* increase refcount of new best neighbor */ - if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount)) + if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount)) neigh_node = NULL; spin_lock_bh(&orig_node->neigh_list_lock); @@ -140,21 +141,35 @@ out: batadv_neigh_node_free_ref(router); } -/* checks whether the host restarted and is in the protection time. - * returns: - * 0 if the packet is to be accepted +/** + * batadv_window_protected - checks whether the host restarted and is in the + * protection time. + * @bat_priv: the bat priv with all the soft interface information + * @seq_num_diff: difference between the current/received sequence number and + * the last sequence number + * @seq_old_max_diff: maximum age of sequence number not considered as restart + * @last_reset: jiffies timestamp of the last reset, will be updated when reset + * is detected + * @protection_started: is set to true if the protection window was started, + * doesn't change otherwise. + * + * Return: + * 0 if the packet is to be accepted. * 1 if the packet is to be ignored. 
*/ int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - unsigned long *last_reset) + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started) { - if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE || + if (seq_num_diff <= -seq_old_max_diff || seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) { if (!batadv_has_timed_out(*last_reset, BATADV_RESET_PROTECTION_MS)) return 1; *last_reset = jiffies; + if (protection_started) + *protection_started = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "old packet received, start protection\n"); } @@ -198,7 +213,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, * @bat_priv: the bat priv with all the soft interface information * @skb: icmp packet to process * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, @@ -398,10 +413,11 @@ out: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. Returns negative - * value when check fails and 0 otherwise. The negative value depends on the - * reason: -ENODATA for bad header, -EBADR for broadcast destination or source, - * and -EREMOTE for non-local (other host) destination. + * Check for short header and bad addresses in given packet. + * + * Return: negative value when check fails and 0 otherwise. The negative value + * depends on the reason: -ENODATA for bad header, -EBADR for broadcast + * destination or source, and -EREMOTE for non-local (other host) destination. 
*/ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -435,7 +451,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, * @orig_node: the destination node * @recv_if: pointer to interface this packet was received on * - * Returns the router which should be used for this orig_node on + * Return: the router which should be used for this orig_node on * this interface, or NULL if not available. */ struct batadv_neigh_node * @@ -482,14 +498,14 @@ batadv_find_router(struct batadv_priv *bat_priv, hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) { /* acquire some structures and references ... */ - if (!atomic_inc_not_zero(&cand->refcount)) + if (!kref_get_unless_zero(&cand->refcount)) continue; cand_router = rcu_dereference(cand->router); if (!cand_router) goto next; - if (!atomic_inc_not_zero(&cand_router->refcount)) { + if (!kref_get_unless_zero(&cand_router->refcount)) { cand_router = NULL; goto next; } @@ -508,8 +524,8 @@ batadv_find_router(struct batadv_priv *bat_priv, /* mark the first possible candidate */ if (!first_candidate) { - atomic_inc(&cand_router->refcount); - atomic_inc(&cand->refcount); + kref_get(&cand_router->refcount); + kref_get(&cand->refcount); first_candidate = cand; first_candidate_router = cand_router; } @@ -648,7 +664,7 @@ out: * the new corresponding information (originator address where the destination * client currently is and its known TTVN) * - * Returns true if the packet header has been updated, false otherwise + * Return: true if the packet header has been updated, false otherwise */ static bool batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, @@ -805,7 +821,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the 
packet has been consumed or NET_RX_DROP * otherwise. */ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, @@ -904,9 +920,8 @@ rx_success: * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on - * @dst_addr: the payload destination * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ int batadv_recv_unicast_tvlv(struct sk_buff *skb, @@ -960,7 +975,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till * lack further fragments; 3) Merge fragments, if we have all needed parts. * - * Return NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. + * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. */ int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) @@ -1065,7 +1080,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, /* check whether the packet is old and the host just restarted. */ if (batadv_window_protected(bat_priv, seq_diff, - &orig_node->bcast_seqno_reset)) + BATADV_BCAST_MAX_AGE, + &orig_node->bcast_seqno_reset, NULL)) goto spin_unlock; /* mark broadcast in flood history, update window position diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 204bbe4..02a5caa 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -52,6 +52,7 @@ batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - unsigned long *last_reset); + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 782fa33..d8b03fd 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -111,7 +111,7 @@ send_skb_err: * host, NULL can be passed as recv_if and no interface alternating is * attempted. * - * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or + * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or * NET_XMIT_POLICED if the skb is buffered for later transmit. */ int batadv_send_skb_to_orig(struct sk_buff *skb, @@ -165,7 +165,7 @@ out: * @hdr_size: amount of bytes to push at the beginning of the skb * @orig_node: the destination node * - * Returns false if the buffer extension was not possible or true otherwise. + * Return: false if the buffer extension was not possible or true otherwise. */ static bool batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, @@ -196,7 +196,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * - * Returns false if the payload could not be encapsulated or true otherwise. + * Return: false if the payload could not be encapsulated or true otherwise. 
*/ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, struct batadv_orig_node *orig_node) @@ -211,10 +211,10 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, * unicast 4addr header * @bat_priv: the bat priv with all the soft interface information * @skb: the skb containing the payload to encapsulate - * @orig_node: the destination node + * @orig: the destination node * @packet_subtype: the unicast 4addr packet subtype to use * - * Returns false if the payload could not be encapsulated or true otherwise. + * Return: false if the payload could not be encapsulated or true otherwise. */ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -265,7 +265,7 @@ out: * as packet_type. Then send this frame to the given orig_node and release a * reference to this orig_node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, @@ -339,7 +339,7 @@ out: * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame * to the according destination node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, @@ -373,7 +373,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, * Look up the currently selected gateway. Wrap the given skb into a batman-adv * unicast header and send this frame to this gateway node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
*/ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) @@ -430,14 +430,19 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, send_time); } -/* add a broadcast packet to the queue and setup timers. broadcast packets - * are sent multiple times to increase probability for being received. +/** + * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending * - * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on - * errors. + * add a broadcast packet to the queue and setup timers. broadcast packets + * are sent multiple times to increase probability for being received. * * The skb is not consumed, so the caller should make sure that the * skb is freed. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 82059f2..7ff95ca 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -69,7 +69,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * header via the translation table. Wrap the given skb into a batman-adv * unicast header. Then send this frame to the according destination node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *dst_hint, @@ -92,7 +92,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, * unicast-4addr header. 
Then send this frame to the according destination * node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index ac4d08d..d4490ff 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -30,6 +30,7 @@ #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -478,22 +479,34 @@ out: } /** + * batadv_softif_vlan_release - release vlan from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the vlan object + */ +static void batadv_softif_vlan_release(struct kref *ref) +{ + struct batadv_softif_vlan *vlan; + + vlan = container_of(ref, struct batadv_softif_vlan, refcount); + + spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); + hlist_del_rcu(&vlan->list); + spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); + + kfree_rcu(vlan, rcu); +} + +/** * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and - * possibly free it - * @softif_vlan: the vlan object to release + * possibly release it + * @vlan: the vlan object to release */ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) { if (!vlan) return; - if (atomic_dec_and_test(&vlan->refcount)) { - spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); - hlist_del_rcu(&vlan->list); - spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); - - kfree_rcu(vlan, rcu); - } + kref_put(&vlan->refcount, batadv_softif_vlan_release); } /** @@ -501,7 +514,7 @@ void 
batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * - * Returns the private data of the vlan matching the vid passed as argument or + * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, @@ -514,7 +527,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, if (vlan_tmp->vid != vid) continue; - if (!atomic_inc_not_zero(&vlan_tmp->refcount)) + if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; @@ -530,7 +543,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * - * Returns 0 on success, a negative error otherwise. + * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { @@ -549,7 +562,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) vlan->bat_priv = bat_priv; vlan->vid = vid; - atomic_set(&vlan->refcount, 1); + kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); @@ -594,12 +607,13 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, /** * batadv_interface_add_vid - ndo_add_vid API implementation * @dev: the netdev of the mesh interface + * @proto: protocol of the the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * - * Returns 0 on success or a negative error code in case of failure. + * Return: 0 on success or a negative error code in case of failure. 
*/ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) @@ -651,12 +665,13 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, /** * batadv_interface_kill_vid - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface + * @proto: protocol of the the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * - * Returns 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q + * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, @@ -745,7 +760,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) * batadv_softif_init_late - late stage initialization of soft interface * @dev: registered network device to modify * - * Returns error code on failures + * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { @@ -847,7 +862,7 @@ free_bat_counters: * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * - * Return 0 if successful or error otherwise. + * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev) @@ -872,7 +887,7 @@ out: * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * - * Return 0 if successful or error otherwise. + * Return: 0 if successful or error otherwise. 
*/ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 8e82176..d17cfba 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fe87777..ab4382b 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/if.h> #include <linux/if_vlan.h> +#include <linux/kref.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -64,7 +65,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj) * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv * @obj: kobject to covert * - * Returns the associated batadv_priv struct. + * Return: the associated batadv_priv struct. */ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) { @@ -82,9 +83,10 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) /** * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct + * @bat_priv: the bat priv with all the soft interface information * @obj: kobject to covert * - * Returns the associated softif_vlan struct if found, NULL otherwise. + * Return: the associated softif_vlan struct if found, NULL otherwise. 
*/ static struct batadv_softif_vlan * batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) @@ -96,7 +98,7 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) if (vlan_tmp->kobj != obj) continue; - if (!atomic_inc_not_zero(&vlan_tmp->refcount)) + if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; @@ -491,7 +493,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, * @attr: the batman-adv attribute the user is interacting with * @buff: the buffer that will contain the data to send back to the user * - * Returns the number of bytes written into 'buff' on success or a negative + * Return: the number of bytes written into 'buff' on success or a negative * error code in case of failure */ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, @@ -511,7 +513,7 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, * @buff: the buffer containing the user data * @count: number of bytes in the buffer * - * Returns 'count' on success or a negative error code in case of failure + * Return: 'count' on success or a negative error code in case of failure */ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, struct attribute *attr, char *buff, @@ -620,9 +622,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = { BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL); -/** - * batadv_vlan_attrs - array of vlan specific sysfs attributes - */ +/* array of vlan specific sysfs attributes */ static struct batadv_attribute *batadv_vlan_attrs[] = { &batadv_attr_vlan_ap_isolation, NULL, @@ -683,7 +683,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev) * @dev: netdev of the mesh interface * @vlan: private data of the newly added VLAN interface * - * Returns 0 on success and -ENOMEM if any of the structure allocations fails. + * Return: 0 on success and -ENOMEM if any of the structure allocations fails. 
*/ int batadv_sysfs_add_vlan(struct net_device *dev, struct batadv_softif_vlan *vlan) diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 6197442..c76021b 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 0e80fd1..5c7fa02 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -31,6 +31,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -68,7 +69,15 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, unsigned short vid, const char *message, bool roaming); -/* returns 1 if they are the same mac addr and vid */ +/** + * batadv_compare_tt - check if two TT entries are the same + * @node: the list element pointer of the first TT entry + * @data2: pointer to the tt_common_entry of the second TT entry + * + * Compare the MAC address and the VLAN ID of the two TT entries and check if + * they are the same TT client. 
+ * Return: 1 if the two TT clients are the same, 0 otherwise + */ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, @@ -84,7 +93,7 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * - * Returns the hash index where the object represented by 'data' should be + * Return: the hash index where the object represented by 'data' should be * stored at. */ static inline u32 batadv_choose_tt(const void *data, u32 size) @@ -105,7 +114,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size) * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the tt_common struct belonging to the searched client if + * Return: a pointer to the tt_common struct belonging to the searched client if * found, NULL otherwise. */ static struct batadv_tt_common_entry * @@ -133,7 +142,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, if (tt->vid != vid) continue; - if (!atomic_inc_not_zero(&tt->refcount)) + if (!kref_get_unless_zero(&tt->refcount)) continue; tt_tmp = tt; @@ -150,7 +159,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the corresponding tt_local_entry struct if the client is + * Return: a pointer to the corresponding tt_local_entry struct if the client is * found, NULL otherwise. 
*/ static struct batadv_tt_local_entry * @@ -175,7 +184,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the corresponding tt_global_entry struct if the client + * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. */ static struct batadv_tt_global_entry * @@ -194,34 +203,68 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, return tt_global_entry; } +/** + * batadv_tt_local_entry_release - release tt_local_entry from lists and queue + * for free after rcu grace period + * @ref: kref pointer of the nc_node + */ +static void batadv_tt_local_entry_release(struct kref *ref) +{ + struct batadv_tt_local_entry *tt_local_entry; + + tt_local_entry = container_of(ref, struct batadv_tt_local_entry, + common.refcount); + + kfree_rcu(tt_local_entry, common.rcu); +} + +/** + * batadv_tt_local_entry_free_ref - decrement the tt_local_entry refcounter and + * possibly release it + * @tt_local_entry: tt_local_entry to be free'd + */ static void batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry) { - if (atomic_dec_and_test(&tt_local_entry->common.refcount)) - kfree_rcu(tt_local_entry, common.rcu); + kref_put(&tt_local_entry->common.refcount, + batadv_tt_local_entry_release); } /** - * batadv_tt_global_entry_free_ref - decrement the refcounter for a - * tt_global_entry and possibly free it - * @tt_global_entry: the object to free + * batadv_tt_global_entry_release - release tt_global_entry from lists and queue + * for free after rcu grace period + * @ref: kref pointer of the nc_node + */ +static void batadv_tt_global_entry_release(struct kref *ref) +{ + struct batadv_tt_global_entry *tt_global_entry; + + tt_global_entry = container_of(ref, struct batadv_tt_global_entry, + common.refcount); + + batadv_tt_global_del_orig_list(tt_global_entry); + 
kfree_rcu(tt_global_entry, common.rcu); +} + +/** + * batadv_tt_global_entry_free_ref - decrement the tt_global_entry refcounter + * and possibly release it + * @tt_global_entry: tt_global_entry to be free'd */ static void batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) { - if (atomic_dec_and_test(&tt_global_entry->common.refcount)) { - batadv_tt_global_del_orig_list(tt_global_entry); - kfree_rcu(tt_global_entry, common.rcu); - } + kref_put(&tt_global_entry->common.refcount, + batadv_tt_global_entry_release); } /** * batadv_tt_global_hash_count - count the number of orig entries - * @hash: hash table containing the tt entries + * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * - * Return the number of originators advertising the given address/data + * Return: the number of originators advertising the given address/data * (excluding ourself). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, @@ -286,9 +329,9 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, } /** - * batadv_tt_global_size_mod - change the size by v of the local table - * identified by vid - * @bat_priv: the bat priv with all the soft interface information + * batadv_tt_global_size_mod - change the size by v of the global table + * for orig_node identified by vid + * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier * @v: the amount to sum to the global table size */ @@ -340,22 +383,28 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, /** * batadv_tt_orig_list_entry_release - release tt orig entry from lists and * queue for free after rcu grace period - * @orig_entry: tt orig entry to be free'd + * @ref: kref pointer of the tt orig entry */ -static void -batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry) +static void 
batadv_tt_orig_list_entry_release(struct kref *ref) { + struct batadv_tt_orig_list_entry *orig_entry; + + orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, + refcount); + batadv_orig_node_free_ref(orig_entry->orig_node); kfree_rcu(orig_entry, rcu); } +/** + * batadv_tt_orig_list_entry_free_ref - decrement the tt orig entry refcounter + * and possibly release it + * @orig_entry: tt orig entry to be free'd + */ static void batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) { - if (!atomic_dec_and_test(&orig_entry->refcount)) - return; - - batadv_tt_orig_list_entry_release(orig_entry); + kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } /** @@ -437,7 +486,7 @@ unlock: * batadv_tt_len - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * - * Returns computed length in bytes. + * Return: computed length in bytes. */ static int batadv_tt_len(int changes_num) { @@ -448,7 +497,7 @@ static int batadv_tt_len(int changes_num) * batadv_tt_entries - compute the number of entries fitting in tt_len bytes * @tt_len: available space * - * Returns the number of entries. + * Return: the number of entries. */ static u16 batadv_tt_entries(u16 tt_len) { @@ -460,7 +509,7 @@ static u16 batadv_tt_entries(u16 tt_len) * size when transmitted over the air * @bat_priv: the bat priv with all the soft interface information * - * Returns local translation table size in bytes. + * Return: local translation table size in bytes. */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { @@ -526,7 +575,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * @mark: the value contained in the skb->mark field of the received packet (if * any) * - * Returns true if the client was successfully added, false otherwise. + * Return: true if the client was successfully added, false otherwise. 
*/ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) @@ -620,7 +669,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, tt_local->common.vid = vid; if (batadv_is_wifi_netdev(in_dev)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; - atomic_set(&tt_local->common.refcount, 2); + kref_init(&tt_local->common.refcount); + kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; @@ -721,12 +771,11 @@ out: * function reserves the amount of space needed to send the entire global TT * table. In case of success the value is updated with the real amount of * reserved bytes - * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * - * Return the size of the allocated buffer or 0 in case of failure. + * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, @@ -800,7 +849,7 @@ out: * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN. * - * Return the size of the allocated buffer or 0 in case of failure. + * Return: the size of the allocated buffer or 0 in case of failure. 
*/ static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, @@ -1042,7 +1091,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, * @message: message to append to the log on deletion * @roaming: true if the deletion is due to a roaming event * - * Returns the flags assigned to the local entry before being deleted + * Return: the flags assigned to the local entry before being deleted */ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, @@ -1242,10 +1291,16 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.changes_list_lock); } -/* retrieves the orig_tt_list_entry belonging to orig_node from the +/** + * batadv_tt_global_orig_entry_find - find a TT orig_list_entry + * @entry: the TT global entry where the orig_list_entry has to be + * extracted from + * @orig_node: the originator for which the orig_list_entry has to be found + * + * retrieve the orig_tt_list_entry belonging to orig_node from the * batadv_tt_global_entry list * - * returns it with an increased refcounter, NULL if not found + * Return: it with an increased refcounter, NULL if not found */ static struct batadv_tt_orig_list_entry * batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, @@ -1259,7 +1314,7 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; - if (!atomic_inc_not_zero(&tmp_orig_entry->refcount)) + if (!kref_get_unless_zero(&tmp_orig_entry->refcount)) continue; orig_entry = tmp_orig_entry; @@ -1270,8 +1325,15 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, return orig_entry; } -/* find out if an orig_node is already in the list of a tt_global_entry. 
- * returns true if found, false otherwise +/** + * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled + * by a given originator + * @entry: the TT global entry to check + * @orig_node: the originator to search in the list + * + * find out if an orig_node is already in the list of a tt_global_entry. + * + * Return: true if found, false otherwise */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, @@ -1309,11 +1371,12 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, goto out; INIT_HLIST_NODE(&orig_entry->list); - atomic_inc(&orig_node->refcount); + kref_get(&orig_node->refcount); batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; - atomic_set(&orig_entry->refcount, 2); + kref_init(&orig_entry->refcount); + kref_get(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); hlist_add_head_rcu(&orig_entry->list, @@ -1343,7 +1406,7 @@ out: * * The caller must hold orig_node refcount. * - * Return true if the new entry has been added, false otherwise + * Return: true if the new entry has been added, false otherwise */ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -1389,7 +1452,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, */ if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; - atomic_set(&common->refcount, 2); + kref_init(&common->refcount); + kref_get(&common->refcount); common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); @@ -1501,7 +1565,7 @@ out: * @tt_global_entry: global translation table entry to be analyzed * * This functon assumes the caller holds rcu_read_lock(). - * Returns best originator list entry or NULL on errors. + * Return: best originator list entry or NULL on errors. 
*/ static struct batadv_tt_orig_list_entry * batadv_transtable_best_orig(struct batadv_priv *bat_priv, @@ -2031,7 +2095,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, * @addr: mac address of the destination client * @vid: VLAN identifier * - * Returns a pointer to the originator that was selected as destination in the + * Return: a pointer to the originator that was selected as destination in the * mesh for contacting the client 'addr', NULL otherwise. * In case of multiple originators serving the same client, the function returns * the best one (best in terms of metric towards the destination node). @@ -2071,7 +2135,7 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, /* found anything? */ if (best_entry) orig_node = best_entry->orig_node; - if (orig_node && !atomic_inc_not_zero(&orig_node->refcount)) + if (orig_node && !kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; rcu_read_unlock(); @@ -2106,7 +2170,7 @@ out: * because the XOR operation can combine them all while trying to reduce the * noise as much as possible. * - * Returns the checksum of the global table of a given originator. + * Return: the checksum of the global table of a given originator. */ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -2183,7 +2247,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, * For details about the computation, please refer to the documentation for * batadv_tt_global_crc(). 
* - * Returns the checksum of the local table + * Return: the checksum of the local table */ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, unsigned short vid) @@ -2289,7 +2353,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node this request is being issued for * - * Returns the pointer to the new tt_req_node struct if no request + * Return: the pointer to the new tt_req_node struct if no request * has already been issued for this orig_node, NULL otherwise. */ static struct batadv_tt_req_node * @@ -2324,7 +2388,7 @@ unlock: * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * - * Returns 1 if the entry is a valid, 0 otherwise. + * Return: 1 if the entry is a valid, 0 otherwise. */ static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) { @@ -2408,9 +2472,8 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries - * @create: if true, create VLAN objects if not found * - * Return true if all the received CRCs match the locally stored ones, false + * Return: true if all the received CRCs match the locally stored ones, false * otherwise */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, @@ -2513,6 +2576,8 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, * @num_vlan: number of tvlv VLAN entries * @full_table: ask for the entire translation table if true, while only for the * last TT diff otherwise + * + * Return: true if the TT Request was sent, false otherwise */ static int batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, @@ -2593,7 +2658,7 @@ out: * @req_src: mac address of tt request sender * @req_dst: 
mac address of tt request recipient * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2725,7 +2790,7 @@ out: * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2843,7 +2908,7 @@ out: * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2938,7 +3003,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, * @addr: the mac address of the client to check * @vid: VLAN identifier * - * Returns true if the client is served by this node, false otherwise. + * Return: true if the client is served by this node, false otherwise. */ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -3055,11 +3120,16 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.roam_list_lock); } -/* This function checks whether the client already reached the +/** + * batadv_tt_check_roam_count - check if a client has roamed too frequently + * @bat_priv: the bat priv with all the soft interface information + * @client: mac address of the roaming client + * + * This function checks whether the client already reached the * maximum number of possible roaming phases. In this case the ROAMING_ADV * will not be sent. 
* - * returns true if the ROAMING_ADV can be sent, false otherwise + * Return: true if the ROAMING_ADV can be sent, false otherwise */ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { @@ -3371,13 +3441,12 @@ out: * batadv_tt_update_orig - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the soft interface information - * @orig: the orig_node of the ogm - * @tt_vlan: pointer to the first tvlv VLAN entry + * @orig_node: the orig_node of the ogm + * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries * @tt_change: pointer to the first entry in the TT buffer * @tt_num_changes: number of tt changes inside the tt buffer * @ttvn: translation table version number of this changeset - * @tt_crc: crc32 checksum of orig node's translation table */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -3459,7 +3528,7 @@ request_table: * @addr: the mac address of the client to check * @vid: VLAN identifier * - * Returns true if we know that the client has moved from its old originator + * Return: true if we know that the client has moved from its old originator * to another one. This entry is still kept for consistency purposes and will be * deleted later by a DEL or because of timeout */ @@ -3485,7 +3554,7 @@ out: * @addr: the mac address of the local client to query * @vid: VLAN identifier * - * Returns true if the local client is known to be roaming (it is not served by + * Return: true if the local client is known to be roaming (it is not served by * this node anymore) or not. 
If yes, the client is still present in the table * to keep the latter consistent with the node TTVN */ @@ -3614,7 +3683,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * - * Returns NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS + * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, @@ -3695,7 +3764,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * - * Returns NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS + * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, @@ -3741,7 +3810,7 @@ out: * batadv_tt_init - initialise the translation table internals * @bat_priv: the bat priv with all the soft interface information * - * Return 0 on success or negative error number in case of failure. + * Return: 0 on success or negative error number in case of failure. */ int batadv_tt_init(struct batadv_priv *bat_priv) { @@ -3779,7 +3848,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * - * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false + * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index abd8e11..7c7e2c0 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 3437b66..612de23 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -25,6 +25,7 @@ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/if_ether.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/sched.h> /* for linux/wait.h */ #include <linux/spinlock.h> @@ -73,7 +74,7 @@ enum batadv_dhcp_recipient { #define BATADV_TT_SYNC_MASK 0x00F0 /** - * struct batadv_hard_iface_bat_iv - per hard interface B.A.T.M.A.N. IV data + * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data * @ogm_buff: buffer holding the OGM packet * @ogm_buff_len: length of the OGM packet buffer * @ogm_seqno: OGM sequence number - used to identify each OGM @@ -97,8 +98,8 @@ struct batadv_hard_iface_bat_iv { * batman-adv for this interface * @soft_iface: the batman-adv interface which uses this network interface * @rcu: struct used for freeing in an RCU-safe manner - * @bat_iv: BATMAN IV specific per hard interface data - * @cleanup_work: work queue callback item for hard interface deinit + * @bat_iv: per hard-interface B.A.T.M.A.N. 
IV data + * @cleanup_work: work queue callback item for hard-interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs * @neigh_list: list of unique single hop neighbors via this interface * @neigh_list_lock: lock protecting neigh_list @@ -110,7 +111,7 @@ struct batadv_hard_iface { struct net_device *net_dev; u8 num_bcasts; struct kobject *hardif_obj; - atomic_t refcount; + struct kref refcount; struct packet_type batman_adv_ptype; struct net_device *soft_iface; struct rcu_head rcu; @@ -125,7 +126,7 @@ struct batadv_hard_iface { /** * struct batadv_orig_ifinfo - originator info per outgoing interface * @list: list node for orig_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard interface + * @if_outgoing: pointer to outgoing hard-interface * @router: router that should be used to reach this originator * @last_real_seqno: last and best known sequence number * @last_ttl: ttl of last received packet @@ -140,7 +141,7 @@ struct batadv_orig_ifinfo { u32 last_real_seqno; u8 last_ttl; unsigned long batman_seqno_reset; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -196,13 +197,13 @@ struct batadv_orig_node_vlan { unsigned short vid; struct batadv_vlan_tt tt; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members - * @bcast_own: set of bitfields (one per hard interface) where each one counts + * @bcast_own: set of bitfields (one per hard-interface) where each one counts * the number of our OGMs this orig_node rebroadcasted "back" to us (relative * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. 
* @bcast_own_sum: sum of bcast_own @@ -298,7 +299,7 @@ struct batadv_orig_node { struct batadv_priv *bat_priv; /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */ spinlock_t bcast_seqno_lock; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; #ifdef CONFIG_BATMAN_ADV_NC struct list_head in_coding_list; @@ -341,15 +342,16 @@ struct batadv_gw_node { struct batadv_orig_node *orig_node; u32 bandwidth_down; u32 bandwidth_up; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** - * batadv_hardif_neigh_node - unique neighbor per hard interface + * struct batadv_hardif_neigh_node - unique neighbor per hard-interface * @list: list node for batadv_hard_iface::neigh_list * @addr: the MAC address of the neighboring interface - * @if_incoming: pointer to incoming hard interface + * @if_incoming: pointer to incoming hard-interface + * @last_seen: when last packet via this neighbor was received * @refcount: number of contexts the object is used * @rcu: struct used for freeing in a RCU-safe manner */ @@ -358,7 +360,7 @@ struct batadv_hardif_neigh_node { u8 addr[ETH_ALEN]; struct batadv_hard_iface *if_incoming; unsigned long last_seen; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -369,7 +371,7 @@ struct batadv_hardif_neigh_node { * @addr: the MAC address of the neighboring interface * @ifinfo_list: list for routing metrics per outgoing interface * @ifinfo_lock: lock protecting private ifinfo members and list - * @if_incoming: pointer to incoming hard interface + * @if_incoming: pointer to incoming hard-interface * @last_seen: when last packet via this neighbor was received * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -382,13 +384,13 @@ struct batadv_neigh_node { spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; unsigned long last_seen; - atomic_t refcount; + struct kref refcount; struct 
rcu_head rcu; }; /** * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing - * interface for BATMAN IV + * interface for B.A.T.M.A.N. IV * @tq_recv: ring buffer of received TQ values from this neigh node * @tq_index: ring buffer index * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) @@ -407,7 +409,7 @@ struct batadv_neigh_ifinfo_bat_iv { /** * struct batadv_neigh_ifinfo - neighbor information per outgoing interface * @list: list node for batadv_neigh_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard interface + * @if_outgoing: pointer to outgoing hard-interface * @bat_iv: B.A.T.M.A.N. IV private structure * @last_ttl: last received ttl from this neigh node * @refcount: number of contexts the object is used @@ -418,7 +420,7 @@ struct batadv_neigh_ifinfo { struct batadv_hard_iface *if_outgoing; struct batadv_neigh_ifinfo_bat_iv bat_iv; u8 last_ttl; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -744,7 +746,7 @@ struct batadv_softif_vlan { atomic_t ap_isolation; /* boolean */ struct batadv_vlan_tt tt; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -771,6 +773,9 @@ struct batadv_softif_vlan { * @orig_interval: OGM broadcast interval in milliseconds * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop * @log_level: configured log level (see batadv_dbg_level) + * @isolation_mark: the skb->mark value used to match packets for AP isolation + * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used + * for the isolation mark * @bcast_seqno: last sent broadcast packet sequence number * @bcast_queue_left: number of remaining buffered broadcast packet slots * @batman_queue_left: number of remaining OGM packet slots @@ -783,8 +788,8 @@ struct batadv_softif_vlan { * @forw_bat_list_lock: lock protecting forw_bat_list * @forw_bcast_list_lock: lock protecting forw_bcast_list * @orig_work: work queue callback item 
for orig node purging - * @cleanup_work: work queue callback item for soft interface deinit - * @primary_if: one of the hard interfaces assigned to this mesh interface + * @cleanup_work: work queue callback item for soft-interface deinit + * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface * @bat_algo_ops: routing algorithm used by this mesh interface * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top @@ -925,7 +930,7 @@ struct batadv_bla_backbone_gw { atomic_t request_sent; u16 crc; spinlock_t crc_lock; /* protects crc */ - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -946,7 +951,7 @@ struct batadv_bla_claim { unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; - atomic_t refcount; + struct kref refcount; }; #endif @@ -967,7 +972,7 @@ struct batadv_tt_common_entry { struct hlist_node hash_entry; u16 flags; unsigned long added_at; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1009,7 +1014,7 @@ struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1062,7 +1067,7 @@ struct batadv_tt_roam_node { struct batadv_nc_node { struct list_head list; u8 addr[ETH_ALEN]; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; struct batadv_orig_node *orig_node; unsigned long last_seen; @@ -1082,7 +1087,7 @@ struct batadv_nc_node { struct batadv_nc_path { struct hlist_node hash_entry; struct rcu_head rcu; - atomic_t refcount; + struct kref refcount; struct list_head packet_list; spinlock_t packet_list_lock; /* Protects packet_list */ u8 next_hop[ETH_ALEN]; @@ -1225,7 +1230,7 @@ struct batadv_dat_entry { unsigned short vid; unsigned long last_update; struct hlist_node hash_entry; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1261,7 +1266,7 @@ struct 
batadv_dat_candidate { struct batadv_tvlv_container { struct hlist_node list; struct batadv_tvlv_hdr tvlv_hdr; - atomic_t refcount; + struct kref refcount; }; /** @@ -1288,7 +1293,7 @@ struct batadv_tvlv_handler { u8 type; u8 version; u8 flags; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 74c278e..73786e2 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -41,6 +41,14 @@ fail: return -EMSGSIZE; } +static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) +{ + e->state = flags & MDB_PG_FLAGS_PERMANENT; + e->flags = 0; + if (flags & MDB_PG_FLAGS_OFFLOAD) + e->flags |= MDB_FLAGS_OFFLOAD; +} + static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { @@ -80,26 +88,41 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + port = p->port; - if (port) { - struct br_mdb_entry e; - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.state = p->state; - e.vid = p->addr.vid; - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, + MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = 
-EMSGSIZE; + goto out; } + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest2); skip: @@ -209,7 +232,7 @@ static inline size_t rtnl_mdb_nlmsg_size(void) } static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, - int type) + int type, struct net_bridge_port_group *pg) { struct switchdev_obj_port_mdb mdb = { .obj = { @@ -232,10 +255,13 @@ static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, #endif mdb.obj.orig_dev = port_dev; - if (port_dev && type == RTM_NEWMDB) - switchdev_port_obj_add(port_dev, &mdb.obj); - else if (port_dev && type == RTM_DELMDB) + if (port_dev && type == RTM_NEWMDB) { + err = switchdev_port_obj_add(port_dev, &mdb.obj); + if (!err && pg) + pg->flags |= MDB_PG_FLAGS_OFFLOAD; + } else if (port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); + } skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); if (!skb) @@ -253,21 +279,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } -void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 state) +void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, + int type) { struct br_mdb_entry entry; memset(&entry, 0, sizeof(entry)); - entry.ifindex = port->dev->ifindex; - entry.addr.proto = group->proto; - entry.addr.u.ip4 = group->u.ip4; + entry.ifindex = pg->port->dev->ifindex; + entry.addr.proto = pg->addr.proto; + entry.addr.u.ip4 = pg->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - entry.addr.u.ip6 = group->u.ip6; + entry.addr.u.ip6 = pg->addr.u.ip6; #endif - entry.state = state; - entry.vid = group->vid; - __br_mdb_notify(dev, &entry, type); + entry.vid = pg->addr.vid; + __mdb_entry_fill_flags(&entry, pg->flags); + __br_mdb_notify(dev, &entry, 
type, pg); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, @@ -412,7 +438,8 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, unsigned char state) + struct br_ip *group, unsigned char state, + struct net_bridge_port_group **pg) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -443,6 +470,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); + *pg = p; if (state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -450,7 +478,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } static int __br_mdb_add(struct net *net, struct net_bridge *br, - struct br_mdb_entry *entry) + struct br_mdb_entry *entry, + struct net_bridge_port_group **pg) { struct br_ip ip; struct net_device *dev; @@ -479,7 +508,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, #endif spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry->state); + ret = br_mdb_add_group(br, p, &ip, entry->state, pg); spin_unlock_bh(&br->multicast_lock); return ret; } @@ -487,6 +516,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); + struct net_bridge_port_group *pg; struct net_bridge_vlan_group *vg; struct net_device *dev, *pdev; struct br_mdb_entry *entry; @@ -516,15 +546,15 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_vlan_enabled(br) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, entry); + err = __br_mdb_add(net, br, entry, &pg); if (err) break; - __br_mdb_notify(dev, entry, RTM_NEWMDB); + __br_mdb_notify(dev, entry, RTM_NEWMDB, 
pg); } } else { - err = __br_mdb_add(net, br, entry); + err = __br_mdb_add(net, br, entry, &pg); if (!err) - __br_mdb_notify(dev, entry, RTM_NEWMDB); + __br_mdb_notify(dev, entry, RTM_NEWMDB, pg); } return err; @@ -568,7 +598,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (p->port->state == BR_STATE_DISABLED) goto unlock; - entry->state = p->state; + __mdb_entry_fill_flags(entry, p->flags); rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); @@ -620,12 +650,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) entry->vid = v->vid; err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + __br_mdb_notify(dev, entry, RTM_DELMDB, NULL); } } else { err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + __br_mdb_notify(dev, entry, RTM_DELMDB, NULL); } return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 03661d9..8b6e424 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -283,8 +283,7 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, - p->state); + br_mdb_notify(br->dev, p, RTM_DELMDB); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && @@ -304,7 +303,7 @@ static void br_multicast_port_group_expired(unsigned long data) spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || - hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT) + hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; br_multicast_del_pg(br, pg); @@ -649,7 +648,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char state) + unsigned char flags) { struct 
net_bridge_port_group *p; @@ -659,7 +658,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->addr = *group; p->port = port; - p->state = state; + p->flags = flags; rcu_assign_pointer(p->next, next); hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, @@ -702,11 +701,11 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY); + p = br_multicast_new_port_group(port, group, *pp, 0); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY); + br_mdb_notify(br->dev, p, RTM_NEWMDB); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -975,7 +974,7 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - if (pg->state == MDB_TEMPORARY) + if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(br, pg); if (!hlist_unhashed(&port->rlist)) { @@ -1453,8 +1452,7 @@ br_multicast_leave_group(struct net_bridge *br, hlist_del_init(&p->mglist); del_timer(&p->timer); call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, port, group, RTM_DELMDB, - p->state); + br_mdb_notify(br->dev, p, RTM_DELMDB); if (!mp->ports && !mp->mglist && netif_running(br->dev)) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 40197ff..e9c635e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -598,7 +598,6 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) return -ENETDOWN; br_set_state(p, state); - br_log_state(p); br_port_state_selection(p->br); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 216018c..1b5d145 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -150,6 +150,9 @@ struct net_bridge_fdb_entry struct rcu_head rcu; }; +#define MDB_PG_FLAGS_PERMANENT 
BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) + struct net_bridge_port_group { struct net_bridge_port *port; struct net_bridge_port_group __rcu *next; @@ -157,7 +160,7 @@ struct net_bridge_port_group { struct rcu_head rcu; struct timer_list timer; struct br_ip addr; - unsigned char state; + unsigned char flags; }; struct net_bridge_mdb_entry @@ -554,11 +557,11 @@ void br_multicast_free_pg(struct rcu_head *head); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char state); + unsigned char flags); void br_mdb_init(void); void br_mdb_uninit(void); -void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 state); +void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, + int type); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); @@ -897,7 +900,6 @@ static inline void br_nf_core_fini(void) {} #endif /* br_stp.c */ -void br_log_state(const struct net_bridge_port *p); void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index b3cca12..c22816a 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -30,13 +30,6 @@ static const char *const br_port_state_names[] = { [BR_STATE_BLOCKING] = "blocking", }; -void br_log_state(const struct net_bridge_port *p) -{ - br_info(p->br, "port %u(%s) entered %s state\n", - (unsigned int) p->port_no, p->dev->name, - br_port_state_names[p->state]); -} - void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { @@ -52,6 +45,10 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", 
(unsigned int) p->port_no, p->dev->name); + else + br_info(p->br, "port %u(%s) entered %s state\n", + (unsigned int) p->port_no, p->dev->name, + br_port_state_names[p->state]); } /* called under bridge lock */ @@ -126,7 +123,6 @@ static void br_root_port_block(const struct net_bridge *br, (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay > 0) @@ -407,7 +403,6 @@ static void br_make_blocking(struct net_bridge_port *p) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->forward_delay_timer); @@ -431,7 +426,6 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay != 0) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a31ac6a..984d462 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -102,7 +102,6 @@ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); } @@ -118,7 +117,6 @@ void br_stp_disable_port(struct net_bridge_port *p) p->topology_change_ack = 0; p->config_pending = 0; - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->message_age_timer); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 5f0f5af..da058b8 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -98,7 +98,6 @@ static void br_forward_delay_timer_expired(unsigned long arg) br_topology_change_detection(br); netif_carrier_on(br->dev); } - br_log_state(p); rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); rcu_read_unlock(); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 85e43af..9309bb4 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -955,6 +955,13 @@ err_rhtbl: */ int 
nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { + struct switchdev_obj_port_vlan v = { + .obj.orig_dev = port->dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; struct net_bridge_vlan *vlan; int ret; @@ -962,6 +969,10 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { + /* Pass the flags to the hardware bridge */ + ret = switchdev_port_obj_add(port->dev, &v.obj); + if (ret && ret != -EOPNOTSUPP) + return ret; __vlan_add_flags(vlan, flags); return 0; } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index fdba3d9..adc8d72 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -48,6 +48,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _oth; + struct net *net = sock_net(oldskb->sk); if (!nft_bridge_iphdr_validate(oldskb)) return; @@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->ttl = sysctl_ip_default_ttl; + niph->ttl = net->ipv4.sysctl_ip_default_ttl; niph->tot_len = htons(nskb->len); ip_send_check(niph); @@ -85,6 +86,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, void *payload; __wsum csum; u8 proto; + struct net *net = sock_net(oldskb->sk); if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) return; @@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); skb_reset_transport_header(nskb); icmph 
= (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); diff --git a/net/core/Makefile b/net/core/Makefile index 0b835de..7a8fb8a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o diff --git a/net/core/dev.c b/net/core/dev.c index 0ef061b..edb7179 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h) trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); } + + __kfree_skb_flush(); } if (sd->output_queue) { @@ -4154,7 +4160,10 @@ ncls: ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); + if (!deliver_exact) + atomic_long_inc(&skb->dev->rx_dropped); + else + atomic_long_inc(&skb->dev->rx_nohandler); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -5152,6 +5161,7 @@ static void net_rx_action(struct softirq_action *h) } } + __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -7253,24 +7263,31 @@ void netdev_run_todo(void) } } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. 
*/ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { #if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); memcpy(stats64, netdev_stats, sizeof(*stats64)); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + sizeof(*netdev_stats), 0, + sizeof(*stats64) - sizeof(*netdev_stats)); #else - size_t i, n = sizeof(*stats64) / sizeof(u64); + size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); const unsigned long *src = (const unsigned long *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = src[i]; + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); #endif } EXPORT_SYMBOL(netdev_stats_to_stats64); @@ -7300,6 +7317,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); storage->tx_dropped += atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3..b5cbbe0 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct 
metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 0000000..3938f3f --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,168 @@ +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable 
*dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + 
dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index daf0470..2406101 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", }; static const char @@ -632,7 +633,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } -u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { @@ -642,6 +643,37 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -738,6 +770,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= 
IFF_RXFH_CONFIGURED; out: kfree(indir); @@ -897,6 +937,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rss_config); @@ -1227,14 +1275,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels; + struct ethtool_channels channels, max; + u32 max_rx_in_use = 0; - if (!dev->ethtool_ops->set_channels) + if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; + dev->ethtool_ops->get_channels(dev, &max); + + /* ensure new counts are within the maximums */ + if ((channels.rx_count > max.max_rx) || + (channels.tx_count > max.max_tx) || + (channels.combined_count > max.max_combined) || + (channels.other_count > max.max_other)) + return -EINVAL; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } @@ -1823,13 +1888,121 @@ out: return ret; } +static int ethtool_get_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int ret; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if (!dev->ethtool_ops->get_per_queue_coalesce) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + 
MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + return ret; + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + useraddr += sizeof(coalesce); + } + + return 0; +} + +static int ethtool_set_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int i, ret = 0; + int n_queue; + struct ethtool_coalesce *backup = NULL, *tmp = NULL; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if ((!dev->ethtool_ops->set_per_queue_coalesce) || + (!dev->ethtool_ops->get_per_queue_coalesce)) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); + tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); + if (ret != 0) + goto roll_back; + + tmp++; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { + ret = -EFAULT; + goto roll_back; + } + + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + goto roll_back; + + useraddr += sizeof(coalesce); + } + +roll_back: + if (ret != 0) { + tmp = backup; + for_each_set_bit(i, queue_mask, bit) { + dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); + tmp++; + } + } + kfree(backup); + + return ret; +} + +static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_per_queue_op per_queue_opt; + + if (copy_from_user(&per_queue_opt, useraddr, 
sizeof(per_queue_opt))) + return -EFAULT; + + switch (per_queue_opt.sub_command) { + case ETHTOOL_GCOALESCE: + return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); + case ETHTOOL_SCOALESCE: + return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); + default: + return -EOPNOTSUPP; + }; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; - u32 ethcmd; + u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; @@ -1839,8 +2012,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (ethcmd == ETHTOOL_PERQUEUE) { + if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) + return -EFAULT; + } else { + sub_cmd = ethcmd; + } /* Allow some commands to be done by anyone */ - switch (ethcmd) { + switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: @@ -2070,6 +2249,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; + case ETHTOOL_PERQUEUE: + rc = ethtool_set_per_queue(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index 94d2620..a3aba15 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -530,12 +530,14 @@ do_pass: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - /* RET_K, RET_A are remaped into 2 insns. */ + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? 
- BPF_K : BPF_X, BPF_REG_0, - BPF_REG_A, fp->k); + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); *insn = BPF_EXIT_INSN(); break; @@ -1181,7 +1183,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) if (bpf_prog_size(prog->len) > sysctl_optmem_max) return -ENOMEM; - if (sk_unhashed(sk)) { + if (sk_unhashed(sk) && sk->sk_reuseport) { err = reuseport_alloc(sk); if (err) return err; @@ -1333,15 +1335,22 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_LDST_LEN 16U +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); struct sk_buff *skb = (struct sk_buff *) (long) r1; int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[BPF_LDST_LEN]; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) @@ -1355,14 +1364,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + len))) + if (unlikely(skb_try_make_writable(skb, offset + len))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, len, buf); + ptr = skb_header_pointer(skb, offset, len, sp->buff); if (unlikely(!ptr)) return -EFAULT; @@ -1371,7 +1378,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) memcpy(ptr, from, len); - if (ptr == buf) + if (ptr == sp->buff) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); @@ -1400,7 +1407,7 @@ static u64 bpf_skb_load_bytes(u64 
r1, u64 r2, u64 r3, u64 r4, u64 r5) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK)) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, to); @@ -1432,9 +1439,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1474,23 +1479,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); if (unlikely(!ptr)) return -EFAULT; + if (is_mmzero && !*ptr) + return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1501,6 +1514,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; if (ptr == &sum) /* skb_store_bits guaranteed to not return -EFAULT here */ skb_store_bits(skb, offset, ptr, 
sizeof(sum)); @@ -1519,6 +1534,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u64 diff_size = from_size + to_size; + __be32 *from = (__be32 *) (long) r1; + __be32 *to = (__be32 *) (long) r3; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; @@ -1682,6 +1736,13 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_vlan_pop) return true; + if (func == bpf_skb_store_bytes) + return true; + if (func == bpf_l3_csum_replace) + return true; + if (func == bpf_l4_csum_replace) + return true; + return false; } @@ -1849,6 +1910,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_csum_diff: + return 
&bpf_csum_diff_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 299cfc2..669ecc9 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -27,6 +27,31 @@ #include <net/rtnetlink.h> #include <net/ip6_fib.h> +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; @@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + rcu_read_unlock(); + request_module("rtnl-lwt-%s", encap_type_str); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + } + } +#endif if (likely(ops && ops->build_state)) ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b6c8a66..4ae17c3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -29,7 +29,6 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; -static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -574,6 +573,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors); 
NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] = { &dev_attr_rx_packets.attr, @@ -599,6 +599,7 @@ static struct attribute *netstat_attrs[] = { &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, NULL }; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0260c84..11fce17 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -9,7 +9,6 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f1efbc3..2ec86fc 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d735e85..62737f4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; + + a->rx_nohandler = b->rx_nohandler; } static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) @@ -1412,6 +1414,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +{ + const struct rtnl_link_ops *ops = NULL; + struct nlattr *linfo[IFLA_INFO_MAX + 1]; + + if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0) + return NULL; + + if (linfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + ops = 
rtnl_link_ops_get(kind); + } + + return ops; +} + +static bool link_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool link_kind_filtered(const struct net_device *dev, + const struct rtnl_link_ops *kind_ops) +{ + if (kind_ops && dev->rtnl_link_ops != kind_ops) + return true; + + return false; +} + +static bool link_dump_filtered(struct net_device *dev, + int master_idx, + const struct rtnl_link_ops *kind_ops) +{ + if (link_master_filtered(dev, master_idx) || + link_kind_filtered(dev, kind_ops)) + return true; + + return false; +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -1421,6 +1475,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; + const struct rtnl_link_ops *kind_ops = NULL; + unsigned int flags = NLM_F_MULTI; + int master_idx = 0; int err; int hdrlen; @@ -1443,18 +1500,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + if (tb[IFLA_MASTER]) + master_idx = nla_get_u32(tb[IFLA_MASTER]); + + if (tb[IFLA_LINKINFO]) + kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); + + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, + flags, ext_filter_mask); /* If we ran out of room on the first message, * 
we're in trouble diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5bf88f5..488566b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -349,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +#define NAPI_SKB_CACHE_SIZE 64 + +struct napi_alloc_cache { + struct page_frag_cache page; + size_t skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -380,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(nc, fragsz, gfp_mask); + return __alloc_page_frag(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -476,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; @@ -496,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(nc, len, gfp_mask); + data = __alloc_page_frag(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -507,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } /* use OR instead of assignment to avoid clearing of bits in mask */ - if (nc->pfmemalloc) + if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -749,6 +757,73 @@ void consume_skb(struct sk_buff 
*skb) } EXPORT_SYMBOL(consume_skb); +void __kfree_skb_flush(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* flush skb_cache if containing objects */ + if (nc->skb_count) { + kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, + nc->skb_cache); + nc->skb_count = 0; + } +} + +static inline void _kfree_skb_defer(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* drop skb->head and call any destructors for packet */ + skb_release_all(skb); + + /* record skb to CPU local list */ + nc->skb_cache[nc->skb_count++] = skb; + +#ifdef CONFIG_SLUB + /* SLUB writes into objects when freeing */ + prefetchw(skb); +#endif + + /* flush skb_cache if it is filled */ + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, + nc->skb_cache); + nc->skb_count = 0; + } +} +void __kfree_skb_defer(struct sk_buff *skb) +{ + _kfree_skb_defer(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + if (unlikely(!skb)) + return; + + /* if budget is 0 assume netpoll w/ IRQs disabled */ + if (unlikely(!budget)) { + dev_consume_skb_irq(skb); + return; + } + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); + return; + } + + _kfree_skb_defer(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + /* Make sure a field is enclosed inside headers_start/headers_end section */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ @@ -3006,8 +3081,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !head_skb->encap_hdr_csum && - !!can_checksum_protocol(features, proto); + csum = 
!!can_checksum_protocol(features, proto); headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3100,13 +3174,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg && !nskb->remcsum_offload) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); + if (!sg) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, len), + len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; continue; } @@ -3172,12 +3248,19 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum && !nskb->remcsum_offload) { - nskb->csum = skb_checksum(nskb, doffset, - nskb->len - doffset, 0); - nskb->ip_summed = CHECKSUM_NONE; + if (!csum) { + if (skb_has_shared_frag(nskb)) { + err = __skb_linearize(nskb); + if (err) + goto err; + } + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); @@ -4415,9 +4498,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) skb->mac_len += VLAN_HLEN; __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 6c1c8bc..46dc8ad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1531,6 +1531,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk = NULL; goto out; } + 
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; newsk->sk_priority = 0; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 902d606..b5672e5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -802,7 +802,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) } lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, + sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index b8608b7..4663a01 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -668,7 +668,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); if (!sk) { @@ -993,7 +993,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a548be2..e0bd013 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); -static void raw_hash(struct sock *sk) +static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&raw_lock); + + return 0; } static void raw_unhash(struct sock *sk) @@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk) return container_of(sk, struct dgram_sock, sk); } -static void dgram_hash(struct sock *sk) +static int dgram_hash(struct sock *sk) { 
write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&dgram_lock); + + return 0; } static void dgram_unhash(struct sock *sk) @@ -1026,8 +1030,13 @@ static int ieee802154_create(struct net *net, struct socket *sock, /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); - if (sk->sk_prot->hash) - sk->sk_prot->hash(sk); + if (sk->sk_prot->hash) { + rc = sk->sk_prot->hash(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7758247..238225b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate + select DST_CACHE default n config NET_IPGRE @@ -405,14 +406,6 @@ config INET_XFRM_MODE_BEET If unsure, say Y. -config INET_LRO - tristate "Large Receive Offload (ipv4/tcp)" - default y - ---help--- - Support for Large Receive Offload (ipv4/tcp). - - If unsure, say Y. - config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 62c049b..bfa1336 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o -obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5c5db66..209d1ed 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -370,7 +370,11 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. 
*/ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { @@ -1091,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p) } EXPORT_SYMBOL(inet_unregister_protosw); -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr __read_mostly; - static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1127,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (new_saddr == old_saddr) return 0; - if (sysctl_ip_dynaddr > 1) { + if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } @@ -1142,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __sk_prot_rehash(sk); - return 0; + return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) @@ -1183,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk) * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ - if (!sysctl_ip_dynaddr || + if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59b3e0e..c102eb5 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -735,6 +735,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out; + /* + * For some 802.11 wireless deployments (and possibly other networks), + * there will be an ARP proxy and gratuitous ARP frames are attacks + * and thus should not be accepted. 
+ */ + if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) + goto out; + /* * Special case: We must set Frame Relay source Q.922 address */ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f6303b1..29b8d3a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2185,6 +2185,8 @@ static struct devinet_sysctl_table { "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), + DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, + "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2192,6 +2194,8 @@ static struct devinet_sysctl_table { "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), + DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, + "drop_unicast_in_l2_multicast"), }, }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 976f0dc..88dab0c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -774,7 +774,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); - uh->check = 0; udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); @@ -784,11 +783,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? 
SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; __be16 sport; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -804,8 +803,8 @@ EXPORT_SYMBOL(fou_build_header); int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; __be16 sport; @@ -814,7 +813,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { - csum = false; optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; @@ -822,7 +820,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, optlen += need_priv ? 
GUE_LEN_PRIV : 0; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5a8ee32..003b0eb 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -18,15 +18,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); - netdev_features_t enc_features; - int ghl; - struct gre_base_hdr *greh; u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; __be16 protocol = skb->protocol; - int tnl_hlen; - bool csum; + u16 mac_len = skb->mac_len; + int gre_offset, outer_hlen; + bool need_csum; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -43,74 +41,59 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, if (!skb->encapsulation) goto out; - if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; - greh = (struct gre_base_hdr *)skb_transport_header(skb); - - ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); - if (unlikely(ghl < sizeof(*greh))) + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; - csum = !!(greh->flags & GRE_CSUM); - if (csum) - skb->encap_hdr_csum = 1; - /* setup inner skb. */ - skb->protocol = greh->protocol; skb->encapsulation = 0; - - if (unlikely(!pskb_may_pull(skb, ghl))) - goto out; - - __skb_pull(skb, ghl); + __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = skb->inner_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + skb->encap_hdr_csum = need_csum; + + features &= skb->dev->hw_enc_features; /* segment inner packet. 
*/ - enc_features = skb->dev->hw_enc_features & features; - segs = skb_mac_gso_segment(skb, enc_features); + segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { - skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; } + outer_hlen = skb_tnl_header_len(skb); + gre_offset = outer_hlen - tnl_hlen; skb = segs; - tnl_hlen = skb_tnl_header_len(skb); do { - __skb_push(skb, ghl); - if (csum) { - __be32 *pcsum; - - if (skb_has_shared_frag(skb)) { - int err; - - err = __skb_linearize(skb); - if (err) { - kfree_skb_list(segs); - segs = ERR_PTR(err); - goto out; - } - } - - skb_reset_transport_header(skb); - - greh = (struct gre_base_hdr *) - skb_transport_header(skb); - pcsum = (__be32 *)(greh + 1); - *pcsum = 0; - *(__sum16 *)pcsum = gso_make_checksum(skb, 0); - } - __skb_push(skb, tnl_hlen - ghl); + struct gre_base_hdr *greh; + __be32 *pcsum; skb_reset_inner_headers(skb); skb->encapsulation = 1; - skb_reset_mac_header(skb); - skb_set_network_header(skb, mac_len); skb->mac_len = mac_len; skb->protocol = protocol; + + __skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, gre_offset); + + if (!need_csum) + continue; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + pcsum = (__be32 *)(greh + 1); + + *pcsum = 0; + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } while ((skb = skb->next)); out: return segs; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 05e4cba..2aea9f1 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -107,12 +107,6 @@ #include <linux/seq_file.h> #endif -#define IP_MAX_MEMBERSHIPS 20 -#define IP_MAX_MSF 10 - -/* IGMP reports for link-local multicast groups are enabled by default */ -int sysctl_igmp_llm_reports __read_mostly = 1; - #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -433,6 +427,7 @@ static struct 
sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; + struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -440,7 +435,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; - if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || @@ -543,6 +538,7 @@ empty_source: static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; + struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { @@ -551,7 +547,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -687,7 +683,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -766,9 +762,10 @@ static void igmp_ifc_timer_expire(unsigned long data) static void igmp_ifc_event(struct in_device *in_dev) { + struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -858,12 +855,13 @@ static int 
igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; + struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return false; rcu_read_lock(); @@ -887,6 +885,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, __be32 group = ih->group; int max_delay; int mark = 0; + struct net *net = dev_net(in_dev->dev); if (len == 8) { @@ -972,7 +971,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&im->lock); if (im->tm_running) @@ -1088,6 +1087,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc; + struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. 
In particular, the refcnt and users are not @@ -1102,7 +1102,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1187,6 +1187,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); int reporter; #endif @@ -1198,7 +1199,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; reporter = im->reporter; @@ -1223,6 +1224,9 @@ static void igmp_group_dropped(struct ip_mc_list *im) static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif if (im->loaded == 0) { im->loaded = 1; @@ -1232,7 +1236,7 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; if (in_dev->dead) @@ -1245,7 +1249,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1314,6 +1318,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev, void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { struct ip_mc_list *im; 
+#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); @@ -1340,7 +1347,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = sysctl_igmp_qrv; + im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1533,6 +1540,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; + struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); @@ -1540,7 +1548,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; /* a failover is happening and switches @@ -1639,6 +1647,9 @@ void ip_mc_down(struct in_device *in_dev) void ip_mc_init_dev(struct in_device *in_dev) { +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST @@ -1646,7 +1657,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1657,11 +1668,14 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); @@ -1727,11 +1741,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) /* * Join a socket to a group */ -int sysctl_igmp_max_memberships 
__read_mostly = IP_MAX_MEMBERSHIPS; -int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; -#ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; -#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1756,6 +1765,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; + struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ @@ -1766,7 +1776,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1824,12 +1834,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1996,6 +2007,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif @@ -2007,7 +2019,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + 
pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2074,7 +2086,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) count++; } err = -ENOBUFS; - if (count >= sysctl_igmp_max_memberships) + if (count >= net->ipv4.sysctl_igmp_max_memberships) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) @@ -2246,7 +2258,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } /* else, add a new source to the filter */ - if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto done; } @@ -2919,6 +2931,12 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } + /* Sysctl initialization */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6414891..d768230 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/sock_reuseport.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || - (sk2->sk_state != TCP_TIME_WAIT && + rcu_access_pointer(sk->sk_reuseport_cb) || + (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -89,161 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, * 
if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, attempts = 5, port = snum; + int smallest_size = -1, smallest_port; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret, attempts = 5; struct net *net = sock_net(sk); - int smallest_size = -1, smallest_rover; + int i, low, high, attempt_half; + struct inet_bind_bucket *tb; kuid_t uid = sock_i_uid(sk); - int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; + u32 remaining, offset; - local_bh_disable(); - if (!snum) { - int remaining, rover, low, high; + if (port) { +have_port: + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; + goto tb_not_found; + } again: - inet_get_local_port_range(net, &low, &high); - if (attempt_half) { - int half = low + ((high - low) >> 1); - - if (attempt_half == 1) - high = half; - else - low = half; - } - remaining = (high - low) + 1; - smallest_rover = rover = prandom_u32() % remaining + low; - - smallest_size = -1; - do { - if (inet_is_local_reserved_port(net, rover)) - goto next_nolock; - head = &hashinfo->bhash[inet_bhashfn(net, rover, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == rover) { - if (((tb->fastreuse > 0 && - sk->sk_reuse && - sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_rover = rover; - } - 
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = rover; - goto tb_found; - } - goto next; + attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; +other_half_scan: + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + if (high - low < 4) + attempt_half = 0; + if (attempt_half) { + int half = low + (((high - low) >> 2) << 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; + + offset = prandom_u32() % remaining; + /* __inet_hash_connect() favors ports having @low parity + * We do the opposite to not pollute connect() users. + */ + offset |= 1U; + smallest_size = -1; + smallest_port = low; /* avoid compiler warning */ + +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (((tb->fastreuse > 0 && reuse) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_port = port; } - break; - next: - spin_unlock(&head->lock); - next_nolock: - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. 
- */ - ret = 1; - if (remaining <= 0) { - if (smallest_size != -1) { - snum = smallest_rover; - goto have_snum; - } - if (attempt_half == 1) { - /* OK we now try the upper half of the range */ - attempt_half = 2; - goto again; + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) + goto tb_found; + goto next_port; } - goto fail; - } - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { -have_snum: - head = &hashinfo->bhash[inet_bhashfn(net, snum, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == snum) - goto tb_found; + goto tb_not_found; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } + + if (smallest_size != -1) { + port = smallest_port; + goto have_port; } - tb = NULL; - goto tb_not_found; + offset--; + if (!(offset & 1)) + goto other_parity_scan; + + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto other_half_scan; + } + return ret; + +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) + goto fail_unlock; tb_found: if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + if (((tb->fastreuse > 0 && reuse) || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) { + smallest_size == -1) goto success; - } else { - ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { - if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size != -1 && --attempts >= 0) { - spin_unlock(&head->lock); - goto again; - } - - goto fail_unlock; + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { + if ((reuse || + (tb->fastreuseport > 0 && + 
sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + smallest_size != -1 && --attempts >= 0) { + spin_unlock_bh(&head->lock); + goto again; } + goto fail_unlock; } - } -tb_not_found: - ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, - net, head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else + if (!reuse) tb->fastreuse = 0; + if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) + tb->fastreuseport = 0; + } else { + tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = 1; tb->fastuid = uid; - } else - tb->fastreuseport = 0; - } else { - if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - if (tb->fastreuseport && - (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + } else { tb->fastreuseport = 0; + } } success: if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); ret = 0; fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); @@ -482,10 +476,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); #define AF_INET_FAMILY(fam) true #endif -/* Only thing we need from tcp.h */ -extern int sysctl_tcp_synack_retries; - - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -557,6 +547,7 @@ static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; + struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; int qlen, expire = 0, resend = 0; @@ -566,7 
+557,7 @@ static void reqsk_timer_handler(unsigned long data) if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; - max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; thresh = max_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -737,6 +728,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + int err = -EADDRINUSE; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -754,13 +746,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog) inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); - return 0; + if (likely(!err)) + return 0; } sk->sk_state = TCP_CLOSE; - return -EADDRINUSE; + return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 6029157..50c0d96 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *sk; if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, + sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, 
req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc5980..bc68ece 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -229,6 +233,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -246,11 +259,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int 
inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = 
__sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -506,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->inet_num; + struct inet_timewait_sock *tw = NULL; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; + int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + struct inet_bind_bucket *tb; + u32 remaining, offset; + int ret, i, low, high; + static u32 hint; + + if (port) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + inet_ehash_nolisten(sk, NULL); + spin_unlock_bh(&head->lock); + return 0; + } + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = check_established(death_row, sk, port, NULL); + local_bh_enable(); + return ret; + } - if (!snum) { - int i, remaining, low, high, port; - static u32 hint; - u32 offset = hint + port_offset; - struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; - inet_get_local_port_range(net, &low, &high); - remaining = (high - low) + 1; + offset = (hint + port_offset) % remaining; + /* In first pass we try ports of @low parity. + * inet_csk_get_port() does the opposite choice. 
+ */ + offset &= ~1U; +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); - /* By starting with offset being an even number, - * we tend to leave about 50% of ports for other uses, - * like bind(0). + /* Does not bother with rcv_saddr checks, because + * the established check is already unique enough. */ - offset &= ~1; - - local_bh_disable(); - for (i = 0; i < remaining; i++) { - port = low + (i + offset) % remaining; - if (inet_is_local_reserved_port(net, port)) - continue; - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && - tb->port == port) { - if (tb->fastreuse >= 0 || - tb->fastreuseport >= 0) - goto next_port; - WARN_ON(hlist_empty(&tb->owners)); - if (!check_established(death_row, sk, - port, &tw)) - goto ok; + inet_bind_bucket_for_each(tb, &head->chain) { + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; - } + WARN_ON(hlist_empty(&tb->owners)); + if (!check_established(death_row, sk, + port, &tw)) + goto ok; + goto next_port; } - - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - tb->fastreuseport = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += (i + 2) & ~1; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->inet_sport = htons(port); - 
inet_ehash_nolisten(sk, (struct sock *)tw); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) { + spin_unlock_bh(&head->lock); + return -ENOMEM; } - if (tw) - inet_twsk_bind_unhash(tw, hinfo); - spin_unlock(&head->lock); + tb->fastreuse = -1; + tb->fastreuseport = -1; + goto ok; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } - if (tw) - inet_twsk_deschedule_put(tw); + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; - ret = 0; - goto out; - } + return -EADDRNOTAVAIL; - head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = check_established(death_row, sk, snum, NULL); -out: - local_bh_enable(); - return ret; +ok: + hint += i + 2; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->inet_sport = htons(port); + inet_ehash_nolisten(sk, (struct sock *)tw); } + if (tw) + inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + if (tw) + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c deleted file mode 100644 index f17ea49..0000000 --- a/net/ipv4/inet_lro.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * linux/net/ipv4/inet_lro.c - * - * Large Receive Offload (ipv4 / tcp) - * - * (C) Copyright IBM Corp. 
2007 - * - * Authors: - * Jan-Bernd Themann <themann@de.ibm.com> - * Christoph Raisch <raisch@de.ibm.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include <linux/module.h> -#include <linux/if_vlan.h> -#include <linux/inet_lro.h> -#include <net/checksum.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); -MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); - -#define TCP_HDR_LEN(tcph) (tcph->doff << 2) -#define IP_HDR_LEN(iph) (iph->ihl << 2) -#define TCP_PAYLOAD_LENGTH(iph, tcph) \ - (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) - -#define IPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_W_TIMESTAMP 8 - -#define LRO_MAX_PG_HLEN 64 - -#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } - -/* - * Basic tcp checks whether packet is suitable for LRO - */ - -static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, - int len, const struct net_lro_desc *lro_desc) -{ - /* check ip header: don't aggregate padded frames */ - if (ntohs(iph->tot_len) != len) - return -1; - - if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) - return -1; - - if (iph->ihl != IPH_LEN_WO_OPTIONS) - return -1; - - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || - tcph->rst || tcph->syn || tcph->fin) - return -1; - - if 
(INET_ECN_is_ce(ipv4_get_dsfield(iph))) - return -1; - - if (tcph->doff != TCPH_LEN_WO_OPTIONS && - tcph->doff != TCPH_LEN_W_TIMESTAMP) - return -1; - - /* check tcp options (only timestamp allowed) */ - if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { - __be32 *topt = (__be32 *)(tcph + 1); - - if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return -1; - - /* timestamp should be in right order */ - topt++; - if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), - ntohl(*topt))) - return -1; - - /* timestamp reply should not be zero */ - topt++; - if (*topt == 0) - return -1; - } - - return 0; -} - -static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) -{ - struct iphdr *iph = lro_desc->iph; - struct tcphdr *tcph = lro_desc->tcph; - __be32 *p; - __wsum tcp_hdr_csum; - - tcph->ack_seq = lro_desc->tcp_ack; - tcph->window = lro_desc->tcp_window; - - if (lro_desc->tcp_saw_tstamp) { - p = (__be32 *)(tcph + 1); - *(p+2) = lro_desc->tcp_rcv_tsecr; - } - - csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); - iph->tot_len = htons(lro_desc->ip_tot_len); - - tcph->check = 0; - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); - lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); - tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - lro_desc->ip_tot_len - - IP_HDR_LEN(iph), IPPROTO_TCP, - lro_desc->data_csum); -} - -static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) -{ - __wsum tcp_csum; - __wsum tcp_hdr_csum; - __wsum tcp_ps_hdr_csum; - - tcp_csum = ~csum_unfold(tcph->check); - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); - - tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - len + TCP_HDR_LEN(tcph), - IPPROTO_TCP, 0); - - return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), - tcp_ps_hdr_csum); -} - -static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, 
struct tcphdr *tcph) -{ - int nr_frags; - __be32 *ptr; - u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - nr_frags = skb_shinfo(skb)->nr_frags; - lro_desc->parent = skb; - lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); - lro_desc->iph = iph; - lro_desc->tcph = tcph; - lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; - lro_desc->tcp_ack = tcph->ack_seq; - lro_desc->tcp_window = tcph->window; - - lro_desc->pkt_aggr_cnt = 1; - lro_desc->ip_tot_len = ntohs(iph->tot_len); - - if (tcph->doff == 8) { - ptr = (__be32 *)(tcph+1); - lro_desc->tcp_saw_tstamp = 1; - lro_desc->tcp_rcv_tsval = *(ptr+1); - lro_desc->tcp_rcv_tsecr = *(ptr+2); - } - - lro_desc->mss = tcp_data_len; - lro_desc->active = 1; - - lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, - tcp_data_len); -} - -static inline void lro_clear_desc(struct net_lro_desc *lro_desc) -{ - memset(lro_desc, 0, sizeof(struct net_lro_desc)); -} - -static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, - struct tcphdr *tcph, int tcp_data_len) -{ - struct sk_buff *parent = lro_desc->parent; - __be32 *topt; - - lro_desc->pkt_aggr_cnt++; - lro_desc->ip_tot_len += tcp_data_len; - lro_desc->tcp_next_seq += tcp_data_len; - lro_desc->tcp_window = tcph->window; - lro_desc->tcp_ack = tcph->ack_seq; - - /* don't update tcp_rcv_tsval, would not work with PAWS */ - if (lro_desc->tcp_saw_tstamp) { - topt = (__be32 *) (tcph + 1); - lro_desc->tcp_rcv_tsecr = *(topt + 2); - } - - lro_desc->data_csum = csum_block_add(lro_desc->data_csum, - lro_tcp_data_csum(iph, tcph, - tcp_data_len), - parent->len); - - parent->len += tcp_data_len; - parent->data_len += tcp_data_len; - if (tcp_data_len > lro_desc->mss) - lro_desc->mss = tcp_data_len; -} - -static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *parent = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, 
tcph, tcp_data_len); - - skb_pull(skb, (skb->len - tcp_data_len)); - parent->truesize += skb->truesize; - - if (lro_desc->last_skb) - lro_desc->last_skb->next = skb; - else - skb_shinfo(parent)->frag_list = skb; - - lro_desc->last_skb = skb; -} - - -static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, - struct iphdr *iph, - struct tcphdr *tcph) -{ - if ((lro_desc->iph->saddr != iph->saddr) || - (lro_desc->iph->daddr != iph->daddr) || - (lro_desc->tcph->source != tcph->source) || - (lro_desc->tcph->dest != tcph->dest)) - return -1; - return 0; -} - -static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_arr, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc = NULL; - struct net_lro_desc *tmp; - int max_desc = lro_mgr->max_desc; - int i; - - for (i = 0; i < max_desc; i++) { - tmp = &lro_arr[i]; - if (tmp->active) - if (!lro_check_tcp_conn(tmp, iph, tcph)) { - lro_desc = tmp; - goto out; - } - } - - for (i = 0; i < max_desc; i++) { - if (!lro_arr[i].active) { - lro_desc = &lro_arr[i]; - goto out; - } - } - - LRO_INC_STATS(lro_mgr, no_desc); -out: - return lro_desc; -} - -static void lro_flush(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_desc) -{ - if (lro_desc->pkt_aggr_cnt > 1) - lro_update_tcp_ip_header(lro_desc); - - skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(lro_desc->parent); - else - netif_rx(lro_desc->parent); - - LRO_INC_STATS(lro_mgr, flushed); - lro_clear_desc(lro_desc); -} - -static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, - void *priv) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - u64 flags; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_skb_header || - lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) - goto out; - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out; - - lro_desc = 
lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) - goto out; - - skb->ip_summed = lro_mgr->ip_summed_aggr; - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return 0; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) - goto out2; - - lro_add_packet(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return 0; - -out2: /* send aggregated SKBs to stack */ - lro_flush(lro_mgr, lro_desc); - -out: - return 1; -} - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - void *priv) -{ - if (__lro_proc_skb(lro_mgr, skb, priv)) { - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); - } -} -EXPORT_SYMBOL(lro_receive_skb); - -void lro_flush_all(struct net_lro_mgr *lro_mgr) -{ - int i; - struct net_lro_desc *lro_desc = lro_mgr->lro_arr; - - for (i = 0; i < lro_mgr->max_desc; i++) { - if (lro_desc[i].active) - lro_flush(lro_mgr, &lro_desc[i]); - } -} -EXPORT_SYMBOL(lro_flush_all); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 187c6fc..efbd47d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -54,8 +54,6 @@ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. 
--ANK */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64; static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb @@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->daddr = arg->iph->daddr; qp->vif = arg->vif; qp->user = arg->user; - qp->peer = sysctl_ipfrag_max_dist ? + qp->peer = q->net->max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : NULL; } @@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = sysctl_ipfrag_max_dist; + unsigned int max = qp->q.net->max_dist; unsigned int start, end; int rc; @@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ipfrag_max_dist", + .data = &init_net.ipv4.frags.max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, { } }; @@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero - }, { } }; @@ -790,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + table[3].data = &net->ipv4.frags.max_dist; } hdr = register_net_sysctl(net, "net/ipv4", table); @@ -865,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.frags.max_dist = 64; + res = inet_frags_init_net(&net->ipv4.frags); if 
(res) return res; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 41ba68d..202437d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return iptunnel_pull_header(skb, hdr_len, tpi->proto); + return iptunnel_pull_header(skb, hdr_len, tpi->proto, false); } static void ipgre_err(struct sk_buff *skb, u32 info, @@ -440,6 +440,17 @@ drop: return 0; } +static __sum16 gre_checksum(struct sk_buff *skb) +{ + __wsum csum; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csum = lco_csum(skb); + else + csum = skb_checksum(skb, 0, skb->len, 0); + return csum_fold(csum); +} + static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { @@ -467,8 +478,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); + *(__sum16 *)ptr = gre_checksum(skb); } } } @@ -493,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool csum) { - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static struct rtable *gre_get_rt(struct sk_buff *skb, @@ -531,9 +540,16 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; + rt = !skb->mark ? 
dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) : + NULL; + if (!rt) { + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + if (!skb->mark) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl.saddr); + } tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d77eb0c..e3d7827 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -308,15 +308,12 @@ drop: return true; } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); - static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { @@ -362,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); - } else if (rt->rt_type == RTN_BROADCAST) + } else if (rt->rt_type == RTN_BROADCAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + } else if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + /* RFC 1122 3.3.6: + * + * When a host sends a datagram to a link-layer broadcast + * address, the IP destination address MUST be a legal IP + * broadcast or IP multicast address. + * + * A host SHOULD silently discard a datagram that is received + * via a link-layer broadcast (see Section 2.4) but does not + * specify an IP multicast or broadcast destination address. + * + * This doesn't explicitly say L2 *broadcast*, but broadcast is + * in a way a form of multicast and the most common use case for + * this is 802.11 protecting against cross-station spoofing (the + * so-called "hole-196" attack) so do it for both. 
+ */ + if (in_dev && + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) + goto drop; + } return dst_input(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 64878ef..f734c42 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -79,9 +79,6 @@ #include <linux/netlink.h> #include <linux/tcp.h> -int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; -EXPORT_SYMBOL(sysctl_ip_default_ttl); - static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a501242..035ad64 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -573,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); int val = 0, err; bool needs_rtnl = setsockopt_needs_rtnl(optname); @@ -912,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > sysctl_igmp_max_msf) { + msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) { kfree(msf); err = -ENOBUFS; break; @@ -1067,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sysctl_igmp_max_msf) { + gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto mc_msf_out; } @@ -1342,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->tos; break; case IP_TTL: + { + struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? 
- sysctl_ip_default_ttl : + net->ipv4.sysctl_ip_default_ttl : inet->uc_ttl); break; + } case IP_HDRINCL: val = inet->hdrincl; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 89e8861..dff8a05 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst, __be32 saddr) -{ - struct dst_entry *old_dst; - - dst_clone(dst); - old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); - dst_release(old_dst); - idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, - struct dst_entry *dst, __be32 saddr) -{ - __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ - tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ - int i; - - for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, - u32 cookie, __be32 *saddr) -{ - struct ip_tunnel_dst *idst; - struct dst_entry *dst; - - rcu_read_lock(); - idst = raw_cpu_ptr(t->dst_cache); - dst = rcu_dereference(idst->dst); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, cookie)) { - *saddr = idst->saddr; - } else { - tunnel_dst_reset(t); - dst_release(dst); - dst = NULL; - } - } - rcu_read_unlock(); - return (struct rtable *)dst; -} - static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { @@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -729,7 
+675,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : + NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -739,7 +686,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { @@ -836,7 +784,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } @@ -975,7 +923,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1169,15 +1117,15 @@ int ip_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1207,7 +1155,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(itn, netdev_priv(dev)); - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 859d415..eaca244 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 
@@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -109,13 +110,10 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->protocol = inner_proto; } - nf_reset(skb); - secpath_reset(skb); skb_clear_hash_if_not_l4(skb); - skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb, xnet); return 0; } EXPORT_SYMBOL_GPL(iptunnel_pull_header); @@ -148,7 +146,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, int gso_type_mask) { int err; @@ -166,20 +163,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } - /* If packet is not gso and we are resolving any partial checksum, - * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL - * on the outer header without confusing devices that implement - * NETIF_F_IP_CSUM with encapsulation. - */ - if (csum_help) - skb->encapsulation = 0; - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; + /* We clear encapsulation here to prevent badly-written + * drivers potentially deciding to offload an inner checksum + * if we set CHECKSUM_PARTIAL on the outer header. + * This should go away when the drivers are all fixed. 
+ */ + skb->encapsulation = 0; + } return skb; error: diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4044da6..ec51d02 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb) if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 5fdc556..7b8fbb35 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -21,6 +21,7 @@ static struct iphdr * synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; + struct net *net = sock_net(skb->sk); skb_reset_network_header(skb); iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); @@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = sysctl_ip_default_ttl; + iph->ttl = net->ipv4.sysctl_ip_default_ttl; iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d3a2716..76dce90 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." 
*/ + + return 0; } void ping_unhash(struct sock *sk) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3abd9d7..9f665b6 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7113bae..8d22de7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk) sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + + return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c..ba0dcff 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,8 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -extern int sysctl_tcp_syncookies; - static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ @@ -307,7 +305,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u8 rcv_wscale; struct flowi4 fl4; - if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4d367b4..1e1fe60 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -283,31 +283,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { 
- .procname = "ip_default_ttl", - .data = &sysctl_ip_default_ttl, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &ip_ttl_min, - .extra2 = &ip_ttl_max, - }, - { - .procname = "tcp_syn_retries", - .data = &sysctl_tcp_syn_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_syn_retries_min, - .extra2 = &tcp_syn_retries_max - }, - { - .procname = "tcp_synack_retries", - .data = &sysctl_tcp_synack_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -322,51 +297,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_early_demux", - .data = &sysctl_ip_early_demux, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ip_dynaddr", - .data = &sysctl_ip_dynaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_retries1", - .data = &sysctl_tcp_retries1, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra2 = &tcp_retr1_max - }, - { - .procname = "tcp_retries2", - .data = &sysctl_tcp_retries2, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_fin_timeout", - .data = &sysctl_tcp_fin_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#ifdef CONFIG_SYN_COOKIES - { - .procname = "tcp_syncookies", - .data = &sysctl_tcp_syncookies, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -415,30 +345,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "igmp_max_memberships", - .data = &sysctl_igmp_max_memberships, - .maxlen = 
sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "igmp_max_msf", - .data = &sysctl_igmp_max_msf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#ifdef CONFIG_IP_MULTICAST - { - .procname = "igmp_qrv", - .data = &sysctl_igmp_qrv, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, -#endif - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -460,13 +366,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_orphan_retries", - .data = &sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fack", .data = &sysctl_tcp_fack, .maxlen = sizeof(int), @@ -481,13 +380,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "tcp_reordering", - .data = &sysctl_tcp_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, .maxlen = sizeof(int), @@ -517,13 +409,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, }, { - .procname = "tcp_notsent_lowat", - .data = &sysctl_tcp_notsent_lowat, - .maxlen = sizeof(sysctl_tcp_notsent_lowat), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), @@ -845,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_dynaddr", + .data = &init_net.ipv4.sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_early_demux", + .data = &init_net.ipv4.sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_default_ttl", + .data = 
&init_net.ipv4.sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_ttl_min, + .extra2 = &ip_ttl_max, + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, @@ -934,12 +842,36 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "igmp_link_local_mcast_reports", - .data = &sysctl_igmp_llm_reports, + .data = &init_net.ipv4.sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "igmp_max_memberships", + .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { + .procname = "igmp_max_msf", + .data = &init_net.ipv4.sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &init_net.ipv4.sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif + { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -960,6 +892,74 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "tcp_syn_retries", + .data = &init_net.ipv4.sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_syn_retries_min, + .extra2 = &tcp_syn_retries_max + }, + { + .procname = "tcp_synack_retries", + .data = &init_net.ipv4.sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_SYN_COOKIES + { + .procname = "tcp_syncookies", + .data = &init_net.ipv4.sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = 
"tcp_reordering", + .data = &init_net.ipv4.sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retries1", + .data = &init_net.ipv4.sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra2 = &tcp_retr1_max + }, + { + .procname = "tcp_retries2", + .data = &init_net.ipv4.sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_orphan_retries", + .data = &init_net.ipv4.sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fin_timeout", + .data = &init_net.ipv4.sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_notsent_lowat", + .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -988,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_dynaddr = 0; + net->ipv4.sysctl_ip_early_demux = 1; + return 0; err_ports: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 483ffdf..f9faadb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,8 +282,6 @@ #include <asm/unaligned.h> #include <net/busy_poll.h> -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; - int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -406,7 +404,7 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; u64_stats_init(&tp->syncp); - tp->reordering = sysctl_tcp_reordering; + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); @@ -1466,8 +1464,10 @@ static struct sk_buff 
*tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; @@ -1657,8 +1657,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -2326,6 +2328,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -2522,7 +2525,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > sysctl_tcp_fin_timeout / HZ) + else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) tp->linger2 = 0; else tp->linger2 = val * HZ; @@ -2639,6 +2642,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + int notsent_bytes; u64 rate64; u32 rate; @@ -2719,6 +2723,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; + + notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + info->tcpi_notsent_bytes = max(0, notsent_bytes); + + info->tcpi_min_rtt = tcp_min_rtt(tp); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2727,6 +2736,7 @@ static 
int do_tcp_getsockopt(struct sock *sk, int level, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int val, len; if (get_user(len, optlen)) @@ -2761,12 +2771,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : sysctl_tcp_fin_timeout) / HZ; + val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 55be6ac..fdb286d 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -124,6 +124,41 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } + +/* If an incoming SYN or SYNACK frame contains a payload and/or FIN, + * queue this additional data / FIN. 
+ */ +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdrlen(skb)); + skb_set_owner_r(skb, sk); + + TCP_SKB_CB(skb)->seq++; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; + + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ + tp->bytes_received = skb->len; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct dst_entry *dst, @@ -132,7 +167,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; - u32 end_seq; bool own_req; req->num_retrans = 0; @@ -178,35 +212,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. - * We used to play tricky games with skb_get(). - * With lockless listener, it is a dead end. - * Do not think about it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now, - * (any reason not to?) but no need to queue the skb since - * there is no data. How about SYN+FIN? 
- */ - end_seq = TCP_SKB_CB(skb)->end_seq; - if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - if (likely(skb2)) { - skb_dst_drop(skb2); - __skb_pull(skb2, tcp_hdrlen(skb)); - skb_set_owner_r(skb2, child); - __skb_queue_tail(&child->sk_receive_queue, skb2); - tp->syn_data_acked = 1; - - /* u64_stats_update_begin(&tp->syncp) not needed here, - * as we certainly are not changing upper 32bit value (0) - */ - tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; - } else { - end_seq = TCP_SKB_CB(skb)->seq + 1; - } - } - tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + + tcp_fastopen_add_skb(child, skb); + + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b2c8e9..e6e65f7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; -int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; int sysctl_tcp_max_reordering __read_mostly = 300; -EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -126,6 +124,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) +#define REXMIT_NONE 0 /* no loss recovery to do */ +#define REXMIT_LOST 1 /* retransmit packets marked lost */ +#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ + /* Adapt the MSS value used to make delayed ack decision to the * real world. 
*/ @@ -1210,6 +1212,7 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; + tp->delivered += pcount; /* Out-of-order packets delivered */ fack_count += pcount; @@ -1821,8 +1824,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) static void tcp_add_reno_sack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + tp->sacked_out++; tcp_check_reno_reordering(sk, 0); + if (tp->sacked_out > prior_sacked) + tp->delivered++; /* Some out-of-order packet is delivered */ tcp_verify_left_out(tp); } @@ -1834,6 +1841,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ + tp->delivered += max_t(int, acked - tp->sacked_out, 1); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -1873,6 +1881,7 @@ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ @@ -1923,9 +1932,9 @@ void tcp_enter_loss(struct sock *sk) * suggests that the degree of reordering is over-estimated. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= sysctl_tcp_reordering) + tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + net->ipv4.sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -2109,6 +2118,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; + int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. 
*/ if (tp->lost_out) @@ -2123,7 +2133,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) */ packets_out = tp->packets_out; if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. @@ -2467,14 +2477,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit, int flag) +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); - int newly_acked_sacked = prior_unsacked - - (tp->packets_out - tp->sacked_out); if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; @@ -2492,7 +2500,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, } else { sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2537,7 +2546,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) +static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2551,8 +2560,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2662,7 +2669,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. 
Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, + int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2684,10 +2692,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; - __tcp_push_pending_frames(sk, tcp_current_mss(sk), - TCP_NAGLE_OFF); - if (after(tp->snd_nxt, tp->high_seq)) - return; /* Step 2.b */ + /* Step 2.b. Try send new data (but deferred until cwnd + * is updated in tcp_ack()). Otherwise fall back to + * the conventional recovery. + */ + if (tcp_send_head(sk) && + after(tcp_wnd_end(tp), tp->snd_nxt)) { + *rexmit = REXMIT_NEW; + return; + } tp->frto = 0; } } @@ -2706,12 +2719,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked, int flag) +static bool tcp_try_undo_partial(struct sock *sk, const int acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2726,10 +2738,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * can undo. Otherwise we clock out new packets but do not * mark more packets lost or retransmit more. 
*/ - if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); + if (tp->retrans_out) return true; - } if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; @@ -2748,21 +2758,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * - * Besides that it does CWND reduction, when packet loss is detected - * and changes state of machine. + * Besides that it updates the congestion state when packet loss or ECN + * is detected. But it does not reduce the cwnd, it is done by the + * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - const int prior_unsacked, - bool is_dupack, int flag) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2809,8 +2819,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, /* Use RACK to detect loss */ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) + tcp_rack_mark_lost(sk)) { flag |= FLAG_LOST_RETRANS; + *ack_flag |= FLAG_LOST_RETRANS; + } /* E. Process state. */ switch (icsk->icsk_ca_state) { @@ -2819,7 +2831,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) + if (tcp_try_undo_partial(sk, acked)) return; /* Partial ACK arrived. Force fast retransmit. 
*/ do_lost = tcp_is_reno(tp) || @@ -2831,7 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack); + tcp_process_loss(sk, flag, is_dupack, rexmit); if (icsk->icsk_ca_state != TCP_CA_Open && !(flag & FLAG_LOST_RETRANS)) return; @@ -2848,7 +2860,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, prior_unsacked); + tcp_try_to_open(sk, flag); return; } @@ -2870,8 +2882,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Kathleen Nichols' algorithm for tracking the minimum value of @@ -3096,7 +3107,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, + u32 prior_snd_una, int *acked, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3154,10 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_ORIG_SACK_ACKED; } - if (sacked & TCPCB_SACKED_ACKED) + if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; - else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } else if (tcp_is_sack(tp)) { + tp->delivered += acked_pcount; + if (!tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3266,6 +3280,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif + *acked = pkts_acked; return flag; } @@ -3299,21 +3314,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const 
int flag) /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - if (tcp_in_cwnd_reduction(sk)) - return false; - /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; } +/* The "ultimate" congestion control function that aims to replace the rigid + * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). + * It's called toward the end of processing an ACK with precise rate + * information. All transmission or retransmission are delayed afterwards. + */ +static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, + int flag) +{ + if (tcp_in_cwnd_reduction(sk)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, acked_sacked, flag); + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + tcp_cong_avoid(sk, ack, acked_sacked); + } + tcp_update_pacing_rate(sk); +} + /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ @@ -3509,6 +3539,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags) icsk->icsk_ca_ops->in_ack_event(sk, flags); } +/* Congestion control has updated the cwnd already. So if we're in + * loss recovery then now we do any new sends (for FRTO) or + * retransmits (for CA_Loss or CA_recovery) that make sense. 
+ */ +static void tcp_xmit_recovery(struct sock *sk, int rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (rexmit == REXMIT_NONE) + return; + + if (unlikely(rexmit == 2)) { + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; + tp->frto = 0; + } + tcp_xmit_retransmit_queue(sk); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3521,8 +3572,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - const int prior_unsacked = tp->packets_out - tp->sacked_out; + u32 prior_delivered = tp->delivered; int acked = 0; /* Number of packets newly acked */ + int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ sack_state.first_sackt.v64 = 0; @@ -3611,23 +3663,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. 
*/ - acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); - acked -= tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3636,14 +3681,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_update_pacing_rate(sk); + tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3666,8 +3711,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_xmit_recovery(sk, rexmit); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -3998,7 +4043,7 @@ void tcp_reset(struct sock *sk) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. 
*/ -static void tcp_fin(struct sock *sk) +void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5512,6 +5557,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + + tcp_fastopen_add_skb(sk, synack); + return false; } @@ -6118,9 +6166,10 @@ static bool tcp_syn_flood_action(const struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; + struct net *net = sock_net(sk); #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { + if (net->ipv4.sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = true; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6129,7 +6178,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && - sysctl_tcp_syncookies != 2 && + net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -6162,6 +6211,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; struct request_sock *req; @@ -6172,7 +6222,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * limitations, they conserve resources and peer is * evidently real one. 
*/ - if ((sysctl_tcp_syncookies == 2 || + if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); if (!want_cookie) @@ -6238,7 +6288,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } } /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && + else if (!net->ipv4.sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 487ac67..4c8d58d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -642,8 +642,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, - &tcp_hashinfo, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, + ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ @@ -865,7 +865,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } - #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of @@ -1587,7 +1586,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->sacked = 0; lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest); if (!sk) goto no_tcp_socket; @@ -1703,7 +1703,8 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, + &tcp_hashinfo, skb, + __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -2395,6 +2396,16 @@ static int __net_init 
tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; + net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; + net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; + net->ipv4.sysctl_tcp_syncookies = 1; + net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; + net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; + net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; + net->ipv4.sysctl_tcp_orphan_retries = 0; + net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c8cbc2b..c26241f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; @@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk) if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && - tp->reordering != sysctl_tcp_reordering) + tp->reordering != net->ipv4.sysctl_tcp_reordering) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 75632a9..fadd8b9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -27,9 +27,6 @@ #include <net/inet_common.h> #include <net/xfrm.h> -int sysctl_tcp_syncookies __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_syncookies); - int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9864a2d..773083b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -135,7 +135,9 @@ 
struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->fin = th->psh = 0; th->check = newcheck; - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); seq += mss; @@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); out: return segs; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fda379c..7d2c7a4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; -EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -3476,6 +3473,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); unsigned long probe_max; int err; @@ -3489,7 +3487,7 @@ void tcp_send_probe0(struct sock *sk) } if (err <= 0) { - if (icsk->icsk_backoff < sysctl_tcp_retries2) + if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; probe_max = TCP_RTO_MAX; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a4730a2..49bc474 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,11 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; -int 
sysctl_tcp_retries1 __read_mostly = TCP_RETR1; -int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; -int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_err(struct sock *sk) @@ -82,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* Calculate maximal number or retries on an orphaned socket. */ static int tcp_orphan_retries(struct sock *sk, bool alive) { - int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (sk->sk_err_soft && !alive) @@ -157,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int retry_until; bool do_reset, syn_set = false; @@ -169,10 +165,10 @@ static int tcp_write_timeout(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } - retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. 
Therefore we conservatively disable * Fast Open on this path on recurring timeouts with @@ -181,7 +177,7 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_data_acked && tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == sysctl_tcp_retries1) + if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -191,7 +187,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } - retry_until = sysctl_tcp_retries2; + retry_until = net->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -305,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk) (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) goto abort; - max_probes = sysctl_tcp_retries2; + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -332,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? 
: - sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -360,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); if (tp->fastopen_rsk) { @@ -490,7 +487,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); out:; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 95d2f19..836abe5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -356,8 +356,8 @@ EXPORT_SYMBOL(udp_lib_get_port); * match_wildcard == false: addresses must be exactly the same, i.e. 
* 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -848,32 +848,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, { struct udphdr *uh = udp_hdr(skb); - if (nocheck) + if (nocheck) { uh->check = 0; - else if (skb_is_gso(skb)) + } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v4_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp_set_csum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4c519c1..56c4c8b 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -32,42 +32,56 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features), __be16 new_protocol, bool is_ipv6) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); + bool remcsum, need_csum, offload_csum; + struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - 
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; - netdev_features_t enc_features; + u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum = !!(skb_shinfo(skb)->gso_type & - SKB_GSO_UDP_TUNNEL_CSUM); - bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); - bool offload_csum = false, dont_encap = (need_csum || remcsum); - - oldlen = (u16)~skb->len; + u32 partial; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; + /* adjust partial header checksum to negate old length */ + partial = (__force u32)uh->check + (__force u16)~uh->len; + + /* setup inner skb. */ skb->encapsulation = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); skb->encap_hdr_csum = need_csum; + + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; /* Try to offload checksum if possible */ offload_csum = !!(need_csum && - ((skb->dev->features & NETIF_F_HW_CSUM) || - (skb->dev->features & (is_ipv6 ? - NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); + (skb->dev->features & + (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : + (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags and + * instead set the flag based on our outer checksum offload value. + */ + if (remcsum) { + features &= ~NETIF_F_CSUM_MASK; + if (offload_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. 
*/ - enc_features = skb->dev->hw_enc_features & features; - segs = gso_inner_segment(skb, enc_features); + segs = gso_inner_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -78,17 +92,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, udp_offset = outer_hlen - tnl_hlen; skb = segs; do { - struct udphdr *uh; - int len; - __be32 delta; + __be16 len; - if (dont_encap) { - skb->encapsulation = 0; + if (remcsum) skb->ip_summed = CHECKSUM_NONE; - } else { - /* Only set up inner headers if we might be offloading - * inner checksum. - */ + + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } @@ -96,43 +106,28 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = mac_len; skb->protocol = protocol; - skb_push(skb, outer_hlen); + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; + len = htons(skb->len - udp_offset); uh = udp_hdr(skb); - uh->len = htons(len); + uh->len = len; if (!need_csum) continue; - delta = htonl(oldlen + len); - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - if (offload_csum) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - } else if (remcsum) { - /* Need to calculate checksum from scratch, - * inner checksums are never when doing - * remote_checksum_offload. 
- */ - - skb->csum = skb_checksum(skb, udp_offset, - skb->len - udp_offset, - 0); - uh->check = csum_fold(skb->csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { - uh->check = gso_make_checksum(skb, ~uh->check); + ((__force u32)len + partial)); + if (skb->encapsulation || !offload_csum) { + uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); } } while ((skb = skb->next)); out: diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 40c8975..11e875f 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -207,6 +207,7 @@ config IPV6_NDISC_NODETYPE config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL + select DST_CACHE ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bdd7eac..4751f89 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4714,6 +4714,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; + array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; + array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; } static inline size_t inet6_ifla6_size(void) @@ -5788,6 +5790,20 @@ static struct addrconf_sysctl_table .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { + .procname = "drop_unicast_in_l2_multicast", + .data = &ipv6_devconf.drop_unicast_in_l2_multicast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_unsolicited_na", + .data = &ipv6_devconf.drop_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = proc_dointvec, + }, + { /* sentinel */ } }, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9f5137c..b11c37c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -235,7 +235,11 @@ lookup_protocol: * creation time automatically shares. */ inet->inet_sport = htons(inet->inet_num); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 32dc9aa..3061305 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -99,5 +99,6 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); +MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 36c3f01..532c3ef 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -26,6 +26,7 @@ #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> +#include <net/sock_reuseport.h> int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) @@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 21ace5a..70f2628 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -17,11 +17,13 @@ #include <linux/module.h> #include <linux/random.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> 
+#include <net/sock_reuseport.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -121,7 +123,9 @@ static inline int compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { @@ -129,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net, const struct hlist_nulls_node *node; struct sock *result; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -146,6 +151,15 @@ begin: if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -163,11 +177,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -177,6 +193,7 @@ begin: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) @@ -184,7 +201,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sock *sk; local_bh_disable(); - sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + sk = 
__inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + ntohs(dport), dif); local_bh_enable(); return sk; @@ -274,3 +292,59 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); + +int inet6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + local_bh_enable(); + } + + return 0; +} +EXPORT_SYMBOL_GPL(inet6_hash); + +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? 
ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 9a4d732..8f92058 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -98,27 +98,16 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb, uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v6_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp6_set_csum); diff --git 
a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index a69aad1..f7c9560 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(tunnel); + dst = dst_cache_get(&tunnel->dst_cache); if (!dst) { dst = ip6_route_output(net, NULL, fl6); @@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(tunnel, ndst); + dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); proto = NEXTHDR_GRE; @@ -1009,7 +1009,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; } @@ -1219,7 +1219,7 @@ static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1257,7 +1257,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(tunnel); + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9075acf..c05c425 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; 
ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); @@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) goto err; + /* If enabled, drop unicast packets that were encapsulated in link-layer + * multicast or broadcast to protected against the so-called "hole-196" + * attack in 802.11 wireless. + */ + if (!ipv6_addr_is_multicast(&hdr->daddr) && + (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) && + idev->cnf.drop_unicast_in_l2_multicast) + goto err; + /* RFC4291 2.7 * Nodes must not originate a packet to a multicast address whose scope * field contains the reserved value 0; if such a packet is received, it diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca4..3f3aabd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - write_seqlock_bh(&idst->lock); - dst_release(rcu_dereference_protected( - idst->dst, - lockdep_is_held(&idst->lock.lock))); - if (dst) { - dst_hold(dst); - idst->cookie = rt6_get_cookie((struct rt6_info *)dst); - } else { - idst->cookie = 0; - } - rcu_assign_pointer(idst->dst, dst); - write_sequnlock_bh(&idst->lock); -} - -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) -{ - struct ip6_tnl_dst *idst; - struct dst_entry *dst; - unsigned int seq; - u32 cookie; - - idst = raw_cpu_ptr(t->dst_cache); - - rcu_read_lock(); - do { - seq = read_seqbegin(&idst->lock); - dst = rcu_dereference(idst->dst); - cookie = idst->cookie; - } while (read_seqretry(&idst->lock, seq)); - - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - - if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { - 
ip6_tnl_per_cpu_dst_set(idst, NULL); - dst_release(dst); - dst = NULL; - } - return dst; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); - -void ip6_tnl_dst_reset(struct ip6_tnl *t) -{ - int i; - - for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); - -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) -{ - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); - -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); - -void ip6_tnl_dst_destroy(struct ip6_tnl *t) -{ - if (!t->dst_cache) - return; - - ip6_tnl_dst_reset(t); - free_percpu(t->dst_cache); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); - -int ip6_tnl_dst_init(struct ip6_tnl *t) -{ - int i; - - t->dst_cache = alloc_percpu(struct ip6_tnl_dst); - if (!t->dst_cache) - return -ENOMEM; - - for_each_possible_cpu(i) - seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); - /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(t); + dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(t, ndst); + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); 
skb_dst_set(skb, dst); skb->transport_header = skb->network_header; @@ -1366,7 +1275,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; } @@ -1637,7 +1546,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(t); + ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b..d90a11f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 84afb9a..c245895 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb) offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; @@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb) return; } + /* For some 802.11 wireless deployments (and possibly other networks), + * there will be a NA proxy and unsolicitd packets are attacks + * and thus should not be accepted. 
+ */ + if (!msg->icmph.icmp6_solicited && idev && + idev->cnf.drop_unsolicited_na) + return; + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 18f3498..e2ea311 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -496,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + skb_postpush_rcsum(head, skb_network_header(head), + skb_network_header_len(head)); rcu_read_lock(); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2066d1c..f45b8ff 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); dev_put(dev); } @@ -740,7 +740,7 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -911,7 +911,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT); if (IS_ERR(skb)) { ip_rt_put(rt); goto out; @@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - skb = iptunnel_handle_offloads(skb, false, 
SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; @@ -1093,7 +1093,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) t->parms.link = p->link; ipip6_tunnel_bind_dev(t->dev); } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } @@ -1124,7 +1124,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } @@ -1278,7 +1278,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); break; @@ -1339,7 +1339,7 @@ static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1372,6 +1372,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + int err; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1382,10 +1383,10 @@ static int ipip6_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } return 0; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 2906ef2..0e393ff 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -148,7 +148,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst; __u8 rcv_wscale; 
- if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5c8c842..33f2820 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -867,7 +867,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, &ipv6h->saddr, + &tcp_hashinfo, NULL, 0, + &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) @@ -1376,8 +1377,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, - inet6_iif(skb)); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1501,6 +1502,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif(skb)); @@ -1866,7 +1868,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 22e28a4..0711f8f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <net/addrconf.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> @@ -77,49 +78,6 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 
addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only - */ -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { - if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) - return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) - return match_wildcard; - } - return 0; - } - - if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} - static u32 udp6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) @@ -590,6 +548,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); struct sock *sk; + int harderr; int err; struct net *net = dev_net(skb->dev); @@ -601,26 +560,27 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } + harderr = icmpv6_err_convert(type, code, &err); + np = inet6_sk(sk); + if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); + if (np->pmtudisc != IPV6_PMTUDISC_DONT) + harderr = 1; } if (type == 
NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); goto out; } - np = inet6_sk(sk); - - if (!icmpv6_err_convert(type, code, &err) && !np->recverr) - goto out; - - if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) - goto out; - - if (np->recverr) + if (!np->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + } sk->sk_err = err; sk->sk_error_report(sk); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index a2c8747..6b54ff3 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -25,6 +25,7 @@ #include <net/udp.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> @@ -718,7 +719,7 @@ static struct proto l2tp_ip6_prot = { .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), #ifdef CONFIG_COMPAT diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8dab4e5..b3c52e3 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; -static int llc_ui_wait_for_conn(struct sock *sk, long timeout); +static long llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); @@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) return rc; } -static int llc_ui_wait_for_conn(struct sock *sk, long timeout) +static long llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT(wait); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index fb31aa8..644a8da 100644 --- a/net/mpls/mpls_iptunnel.c +++ 
b/net/mpls/mpls_iptunnel.c @@ -227,5 +227,6 @@ static void __exit mpls_iptunnel_exit(void) } module_exit(mpls_iptunnel_exit); +MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3264cb49..a3f5cd9 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1019,8 +1019,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -1112,8 +1111,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)); if (IS_ERR(skb)) goto tx_error; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 857ae89..2278d9a 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -127,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) -{ - return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); -} -EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); - int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 8ca9320..11f81c8 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -330,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, * message. 
WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC); + skb = alloc_skb(n, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = nfnetlink_alloc_skb(net, pkt_size, - peer_portid, GFP_ATOMIC); + skb = alloc_skb(pkt_size, GFP_ATOMIC); } } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1d39365..7542999 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0, rem_len = 0; + size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -361,7 +361,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; - rem_len = data_len - hlen; break; } @@ -386,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, - GFP_ATOMIC); + skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); return NULL; diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 3ab591e..7f4414d 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) * belonging to established connections going through that one. 
*/ static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet_lookup_listener(net, &tcp_hashinfo, skb, + ip_hdrlen(skb) + + __tcp_hdrlen(tcph), saddr, sport, daddr, dport, in->ifindex); @@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, #ifdef XT_TPROXY_HAVE_IPV6 static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), in->ifindex); @@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? 
lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); @@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(par->net, tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, @@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, - &iph->saddr, laddr, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, + tproto, &iph->saddr, laddr, hp->source, lport, par->in, 
NFT_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2ec08f0..49d14ec 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb, * box. */ static struct sock * -xt_socket_get_sock_v4(struct net *net, const u8 protocol, +xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, + return __inet_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -148,6 +149,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); u8 uninitialized_var(protocol); @@ -169,6 +172,10 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, sport = hp->source; daddr = iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? 
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, @@ -198,8 +205,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, } #endif - return xt_socket_get_sock_v4(net, protocol, saddr, daddr, - sport, dport, indev); + return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, + daddr, sport, dport, indev); } static bool @@ -318,14 +325,15 @@ extract_icmp6_fields(const struct sk_buff *skb, } static struct sock * -xt_socket_get_sock_v6(struct net *net, const u8 protocol, +xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, &tcp_hashinfo, + return inet6_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -343,6 +351,8 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; int thoff = 0, tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); @@ -362,6 +372,10 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, sport = hp->source; daddr = &iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = tproto == IPPROTO_TCP ? 
+ thoff + __tcp_hdrlen((struct tcphdr *)hp) : + thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { struct ipv6hdr ipv6_var; @@ -373,7 +387,7 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, return NULL; } - return xt_socket_get_sock_v6(net, tproto, saddr, daddr, + return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig index 2c5e95e..5d6e8c0 100644 --- a/net/netlink/Kconfig +++ b/net/netlink/Kconfig @@ -2,15 +2,6 @@ # Netlink Sockets # -config NETLINK_MMAP - bool "NETLINK: mmaped IO" - ---help--- - This option enables support for memory mapped netlink IO. This - reduces overhead by avoiding copying data between kernel- and - userspace. - - If unsure, say N. - config NETLINK_DIAG tristate "NETLINK: socket monitoring interface" default n diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1ffb34..c841679 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, dev_hold(dev); - if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); @@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk) wake_up_interruptible(&nlk->wait); } -#ifdef CONFIG_NETLINK_MMAP -static bool netlink_rx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->rx_ring.pg_vec != NULL; -} - -static bool netlink_tx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->tx_ring.pg_vec != NULL; -} - -static __pure struct page *pgvec_to_page(const void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - else - return virt_to_page(addr); -} - -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; i++) { - if (pg_vec[i] != NULL) { - if (is_vmalloc_addr(pg_vec[i])) - 
vfree(pg_vec[i]); - else - free_pages((unsigned long)pg_vec[i], order); - } - } - kfree(pg_vec); -} - -static void *alloc_one_pg_vec_page(unsigned long order) -{ - void *buffer; - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | - __GFP_NOWARN | __GFP_NORETRY; - - buffer = (void *)__get_free_pages(gfp_flags, order); - if (buffer != NULL) - return buffer; - - buffer = vzalloc((1 << order) * PAGE_SIZE); - if (buffer != NULL) - return buffer; - - gfp_flags &= ~__GFP_NORETRY; - return (void *)__get_free_pages(gfp_flags, order); -} - -static void **alloc_pg_vec(struct netlink_sock *nlk, - struct nl_mmap_req *req, unsigned int order) -{ - unsigned int block_nr = req->nm_block_nr; - unsigned int i; - void **pg_vec; - - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); - if (pg_vec == NULL) - return NULL; - - for (i = 0; i < block_nr; i++) { - pg_vec[i] = alloc_one_pg_vec_page(order); - if (pg_vec[i] == NULL) - goto err1; - } - - return pg_vec; -err1: - free_pg_vec(pg_vec, order, block_nr); - return NULL; -} - - -static void -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, - unsigned int order) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct sk_buff_head *queue; - struct netlink_ring *ring; - - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - ring = tx_ring ? 
&nlk->tx_ring : &nlk->rx_ring; - - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); -} - -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool tx_ring) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - void **pg_vec = NULL; - unsigned int order = 0; - - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - - if (req->nm_block_nr) { - if (ring->pg_vec != NULL) - return -EBUSY; - - if ((int)req->nm_block_size <= 0) - return -EINVAL; - if (!PAGE_ALIGNED(req->nm_block_size)) - return -EINVAL; - if (req->nm_frame_size < NL_MMAP_HDRLEN) - return -EINVAL; - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) - return -EINVAL; - - ring->frames_per_block = req->nm_block_size / - req->nm_frame_size; - if (ring->frames_per_block == 0) - return -EINVAL; - if (ring->frames_per_block * req->nm_block_nr != - req->nm_frame_nr) - return -EINVAL; - - order = get_order(req->nm_block_size); - pg_vec = alloc_pg_vec(nlk, req, order); - if (pg_vec == NULL) - return -ENOMEM; - } else { - if (req->nm_frame_nr) - return -EINVAL; - } - - mutex_lock(&nlk->pg_vec_lock); - if (atomic_read(&nlk->mapped) == 0) { - __netlink_set_ring(sk, req, tx_ring, pg_vec, order); - mutex_unlock(&nlk->pg_vec_lock); - return 0; - } - - mutex_unlock(&nlk->pg_vec_lock); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); - - return -EBUSY; -} - -static void netlink_mm_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - 
struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_inc(&nlk_sk(sk)->mapped); -} - -static void netlink_mm_close(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_dec(&nlk_sk(sk)->mapped); -} - -static const struct vm_operations_struct netlink_mmap_ops = { - .open = netlink_mm_open, - .close = netlink_mm_close, -}; - -static int netlink_mmap(struct file *file, struct socket *sock, - struct vm_area_struct *vma) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - unsigned long start, size, expected; - unsigned int i; - int err = -EINVAL; - - if (vma->vm_pgoff) - return -EINVAL; - - mutex_lock(&nlk->pg_vec_lock); - - expected = 0; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; - } - - if (expected == 0) - goto out; - - size = vma->vm_end - vma->vm_start; - if (size != expected) - goto out; - - start = vma->vm_start; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - - for (i = 0; i < ring->pg_vec_len; i++) { - struct page *page; - void *kaddr = ring->pg_vec[i]; - unsigned int pg_num; - - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { - page = pgvec_to_page(kaddr); - err = vm_insert_page(vma, start, page); - if (err < 0) - goto out; - start += PAGE_SIZE; - kaddr += PAGE_SIZE; - } - } - } - - atomic_inc(&nlk->mapped); - vma->vm_ops = &netlink_mmap_ops; - err = 0; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) -{ -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 - struct page *p_start, *p_end; - - /* First page is flushed through netlink_{get,set}_status */ - p_start = pgvec_to_page(hdr + 
PAGE_SIZE); - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } -#endif -} - -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) -{ - smp_rmb(); - flush_dcache_page(pgvec_to_page(hdr)); - return hdr->nm_status; -} - -static void netlink_set_status(struct nl_mmap_hdr *hdr, - enum nl_mmap_status status) -{ - smp_mb(); - hdr->nm_status = status; - flush_dcache_page(pgvec_to_page(hdr)); -} - -static struct nl_mmap_hdr * -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) -{ - unsigned int pg_vec_pos, frame_off; - - pg_vec_pos = pos / ring->frames_per_block; - frame_off = pos % ring->frames_per_block; - - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); -} - -static struct nl_mmap_hdr * -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, - enum nl_mmap_status status) -{ - struct nl_mmap_hdr *hdr; - - hdr = __netlink_lookup_frame(ring, pos); - if (netlink_get_status(hdr) != status) - return NULL; - - return hdr; -} - -static struct nl_mmap_hdr * -netlink_current_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - return netlink_lookup_frame(ring, ring->head, status); -} - -static void netlink_increment_head(struct netlink_ring *ring) -{ - ring->head = ring->head != ring->frame_max ? 
ring->head + 1 : 0; -} - -static void netlink_forward_ring(struct netlink_ring *ring) -{ - unsigned int head = ring->head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, ring->head); - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) - break; - if (hdr->nm_status != NL_MMAP_STATUS_SKIP) - break; - netlink_increment_head(ring); - } while (ring->head != head); -} - -static bool netlink_has_valid_frame(struct netlink_ring *ring) -{ - unsigned int head = ring->head, pos = head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, pos); - if (hdr->nm_status == NL_MMAP_STATUS_VALID) - return true; - pos = pos != 0 ? pos - 1 : ring->frame_max; - } while (pos != head); - - return false; -} - -static bool netlink_dump_space(struct netlink_sock *nlk) -{ - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - unsigned int n; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - return false; - - n = ring->head + ring->frame_max / 2; - if (n > ring->frame_max) - n -= ring->frame_max; - - hdr = __netlink_lookup_frame(ring, n); - - return hdr->nm_status == NL_MMAP_STATUS_UNUSED; -} - -static unsigned int netlink_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int mask; - int err; - - if (nlk->rx_ring.pg_vec != NULL) { - /* Memory mapped sockets don't call recvmsg(), so flow control - * for dumps is performed here. A dump is allowed to continue - * if at least half the ring is unused. 
- */ - while (nlk->cb_running && netlink_dump_space(nlk)) { - err = netlink_dump(sk); - if (err < 0) { - sk->sk_err = -err; - sk->sk_error_report(sk); - break; - } - } - netlink_rcv_wake(sk); - } - - mask = datagram_poll(file, sock, wait); - - /* We could already have received frames in the normal receive - * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, - * so if mask contains pollin/etc already, there's no point - * walking the ring. - */ - if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } - - spin_lock_bh(&sk->sk_write_queue.lock); - if (nlk->tx_ring.pg_vec) { - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLOUT | POLLWRNORM; - } - spin_unlock_bh(&sk->sk_write_queue.lock); - - return mask; -} - -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) -{ - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); -} - -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, - struct netlink_ring *ring, - struct nl_mmap_hdr *hdr) -{ - unsigned int size; - void *data; - - size = ring->frame_size - NL_MMAP_HDRLEN; - data = (void *)hdr + NL_MMAP_HDRLEN; - - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->len = 0; - - skb->destructor = netlink_skb_destructor; - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; - NETLINK_CB(skb).sk = sk; -} - -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, - u32 dst_portid, u32 dst_group, - struct scm_cookie *scm) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - struct sk_buff *skb; - unsigned int maxlen; - int err = 0, len = 0; - - mutex_lock(&nlk->pg_vec_lock); - - ring = &nlk->tx_ring; - maxlen = ring->frame_size - 
NL_MMAP_HDRLEN; - - do { - unsigned int nm_len; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); - if (hdr == NULL) { - if (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending)) - schedule(); - continue; - } - - nm_len = ACCESS_ONCE(hdr->nm_len); - if (nm_len > maxlen) { - err = -EINVAL; - goto out; - } - - netlink_frame_flush_dcache(hdr, nm_len); - - skb = alloc_skb(nm_len, GFP_KERNEL); - if (skb == NULL) { - err = -ENOBUFS; - goto out; - } - __skb_put(skb, nm_len); - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - - netlink_increment_head(ring); - - NETLINK_CB(skb).portid = nlk->portid; - NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).creds = scm->creds; - - err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; - } - - if (unlikely(dst_group)) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_portid, dst_group, - GFP_KERNEL); - } - err = netlink_unicast(sk, skb, dst_portid, - msg->msg_flags & MSG_DONTWAIT); - if (err < 0) - goto out; - len += err; - - } while (hdr != NULL || - (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending))); - - if (len > 0) - err = len; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) -{ - struct nl_mmap_hdr *hdr; - - hdr = netlink_mmap_hdr(skb); - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_frame_flush_dcache(hdr, hdr->nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; - kfree_skb(skb); -} - -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct 
netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - - spin_lock_bh(&sk->sk_receive_queue.lock); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - kfree_skb(skb); - netlink_overrun(sk); - return; - } - netlink_increment_head(ring); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_set_status(hdr, NL_MMAP_STATUS_COPY); -} - -#else /* CONFIG_NETLINK_MMAP */ -#define netlink_rx_is_mmaped(sk) false -#define netlink_tx_is_mmaped(sk) false -#define netlink_mmap sock_no_mmap -#define netlink_poll datagram_poll -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0 -#endif /* CONFIG_NETLINK_MMAP */ - static void netlink_skb_destructor(struct sk_buff *skb) { -#ifdef CONFIG_NETLINK_MMAP - struct nl_mmap_hdr *hdr; - struct netlink_ring *ring; - struct sock *sk; - - /* If a packet from the kernel to userspace was freed because of an - * error without being delivered to userspace, the kernel must reset - * the status. In the direction userspace to kernel, the status is - * always reset here after the packet was processed and freed. 
- */ - if (netlink_skb_is_mmaped(skb)) { - hdr = netlink_mmap_hdr(skb); - sk = NETLINK_CB(skb).sk; - - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - ring = &nlk_sk(sk)->tx_ring; - } else { - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { - hdr->nm_len = 0; - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - } - ring = &nlk_sk(sk)->rx_ring; - } - - WARN_ON(atomic_read(&ring->pending) == 0); - atomic_dec(&ring->pending); - sock_put(sk); - - skb->head = NULL; - } -#endif if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) @@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk) } skb_queue_purge(&sk->sk_receive_queue); -#ifdef CONFIG_NETLINK_MMAP - if (1) { - struct nl_mmap_req req; - - memset(&req, 0, sizeof(req)); - if (nlk->rx_ring.pg_vec) - __netlink_set_ring(sk, &req, false, NULL, 0); - memset(&req, 0, sizeof(req)); - if (nlk->tx_ring.pg_vec) - __netlink_set_ring(sk, &req, true, NULL, 0); - } -#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); -#ifdef CONFIG_NETLINK_MMAP - mutex_init(&nlk->pg_vec_lock); -#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_S_CONGESTED, &nlk->state)) && - !netlink_skb_is_mmaped(skb)) { + test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) netlink_deliver_tap(skb); -#ifdef CONFIG_NETLINK_MMAP - if (netlink_skb_is_mmaped(skb)) - 
netlink_queue_mmaped_skb(sk, skb); - else if (netlink_rx_is_mmaped(sk)) - netlink_ring_set_copied(sk, skb); - else -#endif /* CONFIG_NETLINK_MMAP */ - skb_queue_tail(&sk->sk_receive_queue, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } @@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) int delta; WARN_ON(skb->sk != NULL); - if (netlink_skb_is_mmaped(skb)) - return skb; - delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; @@ -1876,79 +1248,6 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, - unsigned int ldiff, u32 dst_portid, - gfp_t gfp_mask) -{ -#ifdef CONFIG_NETLINK_MMAP - unsigned int maxlen, linear_size; - struct sock *sk = NULL; - struct sk_buff *skb; - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - - sk = netlink_getsockbyportid(ssk, dst_portid); - if (IS_ERR(sk)) - goto out; - - ring = &nlk_sk(sk)->rx_ring; - /* fast-path without atomic ops for common case: non-mmaped receiver */ - if (ring->pg_vec == NULL) - goto out_put; - - /* We need to account the full linear size needed as a ring - * slot cannot have non-linear parts. 
- */ - linear_size = size + ldiff; - if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) - goto out_put; - - skb = alloc_skb_head(gfp_mask); - if (skb == NULL) - goto err1; - - spin_lock_bh(&sk->sk_receive_queue.lock); - /* check again under lock */ - if (ring->pg_vec == NULL) - goto out_free; - - /* check again under lock */ - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < linear_size) - goto out_free; - - netlink_forward_ring(ring); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - goto err2; - - netlink_ring_setup_skb(skb, sk, ring, hdr); - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); - atomic_inc(&ring->pending); - netlink_increment_head(ring); - - spin_unlock_bh(&sk->sk_receive_queue.lock); - return skb; - -err2: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - netlink_overrun(sk); -err1: - sock_put(sk); - return NULL; - -out_free: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); -out_put: - sock_put(sk); -out: -#endif - return alloc_skb(size, gfp_mask); -} -EXPORT_SYMBOL_GPL(__netlink_alloc_skb); - int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -2225,8 +1524,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && - optlen >= sizeof(int) && + if (optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -2279,25 +1577,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } err = 0; break; -#ifdef CONFIG_NETLINK_MMAP - case NETLINK_RX_RING: - case NETLINK_TX_RING: { - struct nl_mmap_req req; - - /* Rings might consume more memory than queue limits, require - * CAP_NET_ADMIN. 
- */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (optlen < sizeof(req)) - return -EINVAL; - if (copy_from_user(&req, optval, sizeof(req))) - return -EFAULT; - err = netlink_set_ring(sk, &req, - optname == NETLINK_TX_RING); - break; - } -#endif /* CONFIG_NETLINK_MMAP */ case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; @@ -2467,18 +1746,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) smp_rmb(); } - /* It's a really convoluted way for userland to ask for mmaped - * sendmsg(), but that's what we've got... - */ - if (netlink_tx_is_mmaped(sk) && - iter_is_iovec(&msg->msg_iter) && - msg->msg_iter.nr_segs == 1 && - msg->msg_iter.iov->iov_base == NULL) { - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, - &scm); - goto out; - } - err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; @@ -2794,8 +2061,7 @@ static int netlink_dump(struct sock *sk) goto errout_skb; } - if (!netlink_rx_is_mmaped(sk) && - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being @@ -2808,15 +2074,12 @@ static int netlink_dump(struct sock *sk) if (alloc_min_size < nlk->max_recvmsg_len) { alloc_size = nlk->max_recvmsg_len; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); + skb = alloc_skb(alloc_size, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL); + skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; @@ -2831,8 +2094,7 @@ static int netlink_dump(struct sock *sk) * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. 
*/ - if (!netlink_rx_is_mmaped(sk)) - skb_reserve(skb, skb_tailroom(skb) - alloc_size); + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -2884,16 +2146,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct netlink_sock *nlk; int ret; - /* Memory mapped dump requests need to be copied to avoid looping - * on the pending state in netlink_mmap_sendmsg() while the CB hold - * a reference to the skb. - */ - if (netlink_skb_is_mmaped(skb)) { - skb = skb_copy(skb, GFP_KERNEL); - if (skb == NULL) - return -ENOBUFS; - } else - atomic_inc(&skb->users); + atomic_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { @@ -2966,8 +2219,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); - skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), - NETLINK_CB(in_skb).portid, GFP_KERNEL); + skb = nlmsg_new(payload, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -3241,7 +2493,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = netlink_poll, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -3249,7 +2501,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = netlink_mmap, + .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 14437d9..e68ef9c 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -44,12 +44,6 @@ struct netlink_sock { int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); struct module *module; -#ifdef CONFIG_NETLINK_MMAP - struct mutex 
pg_vec_lock; - struct netlink_ring rx_ring; - struct netlink_ring tx_ring; - atomic_t mapped; -#endif /* CONFIG_NETLINK_MMAP */ struct rhash_head node; struct rcu_head rcu; @@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } -static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ -#ifdef CONFIG_NETLINK_MMAP - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -#else - return false; -#endif /* CONFIG_NETLINK_MMAP */ -} - struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 3ee63a3cf..8dd836a 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -8,41 +8,6 @@ #include "af_netlink.h" -#ifdef CONFIG_NETLINK_MMAP -static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, - struct sk_buff *nlskb) -{ - struct netlink_diag_ring ndr; - - ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; - ndr.ndr_block_nr = ring->pg_vec_len; - ndr.ndr_frame_size = ring->frame_size; - ndr.ndr_frame_nr = ring->frame_max + 1; - - return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); -} - -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - int ret; - - mutex_lock(&nlk->pg_vec_lock); - ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); - if (!ret) - ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, - nlskb); - mutex_unlock(&nlk->pg_vec_lock); - - return ret; -} -#else -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - return 0; -} -#endif - static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); @@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; - if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && - sk_diag_put_rings_cfg(sk, skb)) - 
goto out_nlmsg_trim; - nlmsg_end(skb, nlh); return 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f830326..a09132a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -463,26 +463,6 @@ int genl_unregister_family(struct genl_family *family) EXPORT_SYMBOL(genl_unregister_family); /** - * genlmsg_new_unicast - Allocate generic netlink message for unicast - * @payload: size of the message payload - * @info: information on destination - * @flags: the type of memory to allocate - * - * Allocates a new sk_buff large enough to cover the specified payload - * plus required Netlink headers. Will check receiving socket for - * memory mapped i/o capability and use it if enabled. Will fall back - * to non-mapped skb if message size exceeds the frame size of the ring. - */ -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags) -{ - size_t len = nlmsg_total_size(genlmsg_total_size(payload)); - - return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); -} -EXPORT_SYMBOL_GPL(genlmsg_new_unicast); - -/** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to @@ -580,6 +560,10 @@ static int genl_family_rcv_msg(struct genl_family *family, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { int rc; @@ -638,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; - info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d143aa9..cd5fd9d 100644 --- a/net/openvswitch/Kconfig +++ 
b/net/openvswitch/Kconfig @@ -10,6 +10,7 @@ config OPENVSWITCH select LIBCRC32C select MPLS select NET_MPLS_GSO + select DST_CACHE ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2d59df5..e9dd47b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, new_mpls_lse = (__be32 *)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, - MPLS_HLEN, 0)); + skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); hdr = eth_hdr(skb); hdr->h_proto = mpls->mpls_ethertype; @@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, mask->eth_dst); - ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); @@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk /* Reconstruct the MAC header. 
*/ skb_push(skb, data->l2_len); memcpy(skb->data, &data->l2_data, data->l2_len); - ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); skb_reset_mac_header(skb); ovs_vport_send(vport, skb); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index deadfda..c4e8455 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; - struct genl_info info = { - .dst_sk = ovs_dp_get_net(dp)->genl_sock, - .snd_portid = upcall_info->portid, - }; size_t len; unsigned int hlen; int err, dp_ifindex; @@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, hlen = skb->len; len = upcall_msg_size(upcall_info, hlen); - user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -654,7 +650,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = packet_policy, .doit = ovs_packet_cmd_execute } @@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); - skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -1391,12 +1387,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
*/ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_del }, @@ -1407,7 +1403,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_set, }, @@ -1481,9 +1477,9 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +static struct sk_buff *ovs_dp_cmd_alloc_info(void) { - return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); + return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. 
*/ @@ -1536,7 +1532,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1657,7 +1653,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1690,7 +1686,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1723,7 +1719,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1777,12 +1773,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_del }, @@ -1793,7 +1789,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
*/ .policy = datapath_policy, .doit = ovs_dp_cmd_set, }, @@ -2158,12 +2154,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_del }, @@ -2174,7 +2170,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_set, }, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1bd4a4..58b8efc 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (!tun_dst) return -ENOMEM; + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, sizeof(*ovs_tun), log); if (IS_ERR(a)) { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 6a6adf3..4e39723 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb) return; skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: diff --git 
a/net/openvswitch/vport.h b/net/openvswitch/vport.h index c10899cb..f01f28a 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv) int ovs_vport_receive(struct vport *, struct sk_buff *, const struct ip_tunnel_info *); -static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); -} - static inline const char *ovs_vport_name(struct vport *vport) { return vport->dev->name; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 992396a..b7e7851 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1960,6 +1960,64 @@ static unsigned int run_filter(struct sk_buff *skb, return res; } +static int __packet_rcv_vnet(const struct sk_buff *skb, + struct virtio_net_hdr *vnet_hdr) +{ + *vnet_hdr = (const struct virtio_net_hdr) { 0 }; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. 
*/ + vnet_hdr->hdr_len = + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); + vnet_hdr->gso_size = + __cpu_to_virtio16(vio_le(), sinfo->gso_size); + + if (sinfo->gso_type & SKB_GSO_TCPV4) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; + else if (sinfo->gso_type & SKB_GSO_FCOE) + return -EINVAL; + else + BUG(); + + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(), + skb_checksum_start_offset(skb)); + vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(), + skb->csum_offset); + } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; + } /* else everything is zero */ + + return 0; +} + +static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, + size_t *len) +{ + struct virtio_net_hdr vnet_hdr; + + if (*len < sizeof(vnet_hdr)) + return -EINVAL; + *len -= sizeof(vnet_hdr); + + if (__packet_rcv_vnet(skb, &vnet_hdr)) + return -EINVAL; + + return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); +} + /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. @@ -2148,7 +2206,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 
16 : maclen)) + - po->tp_reserve; + po->tp_reserve; + if (po->has_vnet_hdr) + netoff += sizeof(struct virtio_net_hdr); macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { @@ -2185,7 +2245,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) - goto ring_is_full; + goto drop_n_account; if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* @@ -2204,6 +2264,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } spin_unlock(&sk->sk_receive_queue.lock); + if (po->has_vnet_hdr) { + if (__packet_rcv_vnet(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr))) { + spin_lock(&sk->sk_receive_queue.lock); + goto drop_n_account; + } + } + skb_copy_bits(skb, 0, h.raw + macoff, snaplen); if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) @@ -2299,7 +2367,7 @@ drop: kfree_skb(skb); return 0; -ring_is_full: +drop_n_account: po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); @@ -2347,15 +2415,92 @@ static void tpacket_set_protocol(const struct net_device *dev, } } +static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) +{ + unsigned short gso_type = 0; + + if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) + vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); + + if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) + return -EINVAL; + + if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + gso_type = SKB_GSO_TCPV6; + 
break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; + default: + return -EINVAL; + } + + if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) + gso_type |= SKB_GSO_TCP_ECN; + + if (vnet_hdr->gso_size == 0) + return -EINVAL; + } + + vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */ + return 0; +} + +static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, + struct virtio_net_hdr *vnet_hdr) +{ + int n; + + if (*len < sizeof(*vnet_hdr)) + return -EINVAL; + *len -= sizeof(*vnet_hdr); + + n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter); + if (n != sizeof(*vnet_hdr)) + return -EFAULT; + + return __packet_snd_vnet_parse(vnet_hdr, *len); +} + +static int packet_snd_vnet_gso(struct sk_buff *skb, + struct virtio_net_hdr *vnet_hdr) +{ + if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset); + + if (!skb_partial_csum_set(skb, s, o)) + return -EINVAL; + } + + skb_shinfo(skb)->gso_size = + __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size); + skb_shinfo(skb)->gso_type = vnet_hdr->gso_type; + + /* Header must be checked, and gso_segs computed. 
*/ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + return 0; +} + static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, - void *frame, struct net_device *dev, int size_max, - __be16 proto, unsigned char *addr, int hlen) + void *frame, struct net_device *dev, void *data, int tp_len, + __be16 proto, unsigned char *addr, int hlen, int copylen) { union tpacket_uhdr ph; - int to_write, offset, len, tp_len, nr_frags, len_max; + int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; - void *data; int err; ph.raw = frame; @@ -2367,51 +2512,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; - switch (po->tp_version) { - case TPACKET_V2: - tp_len = ph.h2->tp_len; - break; - default: - tp_len = ph.h1->tp_len; - break; - } - if (unlikely(tp_len > size_max)) { - pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); - return -EMSGSIZE; - } - skb_reserve(skb, hlen); skb_reset_network_header(skb); - if (unlikely(po->tp_tx_has_off)) { - int off_min, off_max, off; - off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); - off_max = po->tx_ring.frame_size - tp_len; - if (sock->type == SOCK_DGRAM) { - switch (po->tp_version) { - case TPACKET_V2: - off = ph.h2->tp_net; - break; - default: - off = ph.h1->tp_net; - break; - } - } else { - switch (po->tp_version) { - case TPACKET_V2: - off = ph.h2->tp_mac; - break; - default: - off = ph.h1->tp_mac; - break; - } - } - if (unlikely((off < off_min) || (off_max < off))) - return -EINVAL; - data = ph.raw + off; - } else { - data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); - } to_write = tp_len; if (sock->type == SOCK_DGRAM) { @@ -2419,20 +2522,17 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; - } else if 
(dev->hard_header_len) { - if (ll_header_truncated(dev, tp_len)) - return -EINVAL; - + } else if (copylen) { skb_push(skb, dev->hard_header_len); - err = skb_store_bits(skb, 0, data, - dev->hard_header_len); + skb_put(skb, copylen - dev->hard_header_len); + err = skb_store_bits(skb, 0, data, copylen); if (unlikely(err)) return err; if (!skb->protocol) tpacket_set_protocol(dev, skb); - data += dev->hard_header_len; - to_write -= dev->hard_header_len; + data += copylen; + to_write -= copylen; } offset = offset_in_page(data); @@ -2469,10 +2569,66 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, return tp_len; } +static int tpacket_parse_header(struct packet_sock *po, void *frame, + int size_max, void **data) +{ + union tpacket_uhdr ph; + int tp_len, off; + + ph.raw = frame; + + switch (po->tp_version) { + case TPACKET_V2: + tp_len = ph.h2->tp_len; + break; + default: + tp_len = ph.h1->tp_len; + break; + } + if (unlikely(tp_len > size_max)) { + pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); + return -EMSGSIZE; + } + + if (unlikely(po->tp_tx_has_off)) { + int off_min, off_max; + + off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); + off_max = po->tx_ring.frame_size - tp_len; + if (po->sk.sk_type == SOCK_DGRAM) { + switch (po->tp_version) { + case TPACKET_V2: + off = ph.h2->tp_net; + break; + default: + off = ph.h1->tp_net; + break; + } + } else { + switch (po->tp_version) { + case TPACKET_V2: + off = ph.h2->tp_mac; + break; + default: + off = ph.h1->tp_mac; + break; + } + } + if (unlikely((off < off_min) || (off_max < off))) + return -EINVAL; + } else { + off = po->tp_hdrlen - sizeof(struct sockaddr_ll); + } + + *data = frame + off; + return tp_len; +} + static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb; struct net_device *dev; + struct virtio_net_hdr *vnet_hdr = NULL; __be16 proto; int err, reserve = 0; void *ph; @@ -2480,9 +2636,10 @@ static int tpacket_snd(struct packet_sock *po, 
struct msghdr *msg) bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int tp_len, size_max; unsigned char *addr; + void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; - int hlen, tlen; + int hlen, tlen, copylen = 0; mutex_lock(&po->pg_vec_lock); @@ -2515,7 +2672,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if (size_max > dev->mtu + reserve + VLAN_HLEN) + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) size_max = dev->mtu + reserve + VLAN_HLEN; do { @@ -2527,11 +2684,36 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) continue; } + skb = NULL; + tp_len = tpacket_parse_header(po, ph, size_max, &data); + if (tp_len < 0) + goto tpacket_error; + status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; + if (po->has_vnet_hdr) { + vnet_hdr = data; + data += sizeof(*vnet_hdr); + tp_len -= sizeof(*vnet_hdr); + if (tp_len < 0 || + __packet_snd_vnet_parse(vnet_hdr, tp_len)) { + tp_len = -EINVAL; + goto tpacket_error; + } + copylen = __virtio16_to_cpu(vio_le(), + vnet_hdr->hdr_len); + } + if (dev->hard_header_len) { + if (ll_header_truncated(dev, tp_len)) { + tp_len = -EINVAL; + goto tpacket_error; + } + copylen = max_t(int, copylen, dev->hard_header_len); + } skb = sock_alloc_send_skb(&po->sk, - hlen + tlen + sizeof(struct sockaddr_ll), + hlen + tlen + sizeof(struct sockaddr_ll) + + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { @@ -2540,14 +2722,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) err = len_sum; goto out_status; } - tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, - addr, hlen); + tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, + addr, hlen, copylen); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && + !po->has_vnet_hdr && !packet_extra_vlan_len_allowed(dev, skb)) 
tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { +tpacket_error: if (po->tp_loss) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); @@ -2561,6 +2745,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } + if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) { + tp_len = -EINVAL; + goto tpacket_error; + } + packet_pick_tx_queue(dev, skb); skb->destructor = tpacket_destruct_skb; @@ -2643,12 +2832,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; - int vnet_hdr_len; struct packet_sock *po = pkt_sk(sk); - unsigned short gso_type = 0; int hlen, tlen; int extra_len = 0; - ssize_t n; /* * Get and verify the address. @@ -2686,53 +2872,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { - vnet_hdr_len = sizeof(vnet_hdr); - - err = -EINVAL; - if (len < vnet_hdr_len) - goto out_unlock; - - len -= vnet_hdr_len; - - err = -EFAULT; - n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter); - if (n != vnet_hdr_len) - goto out_unlock; - - if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); - - err = -EINVAL; - if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); + if (err) goto out_unlock; - - if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - gso_type = SKB_GSO_TCPV4; - break; - case VIRTIO_NET_HDR_GSO_TCPV6: - gso_type = SKB_GSO_TCPV6; - break; - case 
VIRTIO_NET_HDR_GSO_UDP: - gso_type = SKB_GSO_UDP; - break; - default: - goto out_unlock; - } - - if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) - gso_type |= SKB_GSO_TCP_ECN; - - if (vnet_hdr.gso_size == 0) - goto out_unlock; - - } } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2744,7 +2886,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } err = -EMSGSIZE; - if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) + if (!vnet_hdr.gso_type && + (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; @@ -2775,7 +2918,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (!gso_type && (len > dev->mtu + reserve + extra_len) && + if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; @@ -2789,24 +2932,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { - if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); - if (!skb_partial_csum_set(skb, s, o)) { - err = -EINVAL; - goto out_free; - } - } - - skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); - skb_shinfo(skb)->gso_type = gso_type; - - /* Header must be checked, and gso_segs computed. 
*/ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; - - len += vnet_hdr_len; + err = packet_snd_vnet_gso(skb, &vnet_hdr); + if (err) + goto out_free; + len += sizeof(vnet_hdr); } skb_probe_transport_header(skb, reserve); @@ -3177,51 +3306,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, packet_rcv_has_room(pkt_sk(sk), NULL); if (pkt_sk(sk)->has_vnet_hdr) { - struct virtio_net_hdr vnet_hdr = { 0 }; - - err = -EINVAL; - vnet_hdr_len = sizeof(vnet_hdr); - if (len < vnet_hdr_len) - goto out_free; - - len -= vnet_hdr_len; - - if (skb_is_gso(skb)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - /* This is a hint as to how much should be linear. */ - vnet_hdr.hdr_len = - __cpu_to_virtio16(vio_le(), skb_headlen(skb)); - vnet_hdr.gso_size = - __cpu_to_virtio16(vio_le(), sinfo->gso_size); - if (sinfo->gso_type & SKB_GSO_TCPV4) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; - else if (sinfo->gso_type & SKB_GSO_FCOE) - goto out_free; - else - BUG(); - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), - skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), - skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ - - err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len); - if (err < 0) + err = packet_rcv_vnet(msg, skb, &len); + if (err) goto out_free; + vnet_hdr_len = sizeof(struct virtio_net_hdr); } /* You lose any data beyond the buffer you gave. 
If it worries @@ -3552,8 +3640,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv } if (optlen < len) return -EINVAL; - if (pkt_sk(sk)->has_vnet_hdr) - return -EINVAL; if (copy_from_user(&req_u.req, optval, len)) return -EFAULT; return packet_set_ring(sk, &req_u, 0, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index d575ef4..ffd5f22 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) rcu_read_unlock(); } -void pn_sock_hash(struct sock *sk) +int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); + + return 0; } EXPORT_SYMBOL(pn_sock_hash); @@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) pn->resource = spn->spn_resource; /* Enable RX on the socket */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9d6ddba..ad60299 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -37,7 +37,6 @@ #include <net/tcp.h> #include <net/net_namespace.h> #include <net/netns/generic.h> -#include <net/tcp.h> #include "rds.h" #include "tcp.h" diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b07c535..eeb3eb3 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -105,9 +105,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, int hl = ihl + jhl; if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || - (skb_cloned(skb) && - !skb_clone_writable(skb, hl + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, hl + ntkoff)) return NULL; else return (void *)(skb_network_header(skb) + ihl); @@ -365,9 +363,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { - if 
(skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff)) goto fail; ip_send_check(ip_hdr(skb)); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b7c4ead..27607b8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -126,9 +126,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, addr = iph->daddr; if (!((old_addr ^ addr) & mask)) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + noff)) goto drop; new_addr &= mask; @@ -156,9 +154,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcphdr *tcph; if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff)) goto drop; tcph = (void *)(skb_network_header(skb) + ihl); @@ -171,9 +167,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct udphdr *udph; if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*udph) + noff)) goto drop; udph = (void *)(skb_network_header(skb) + ihl); @@ -213,10 +207,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, if ((old_addr ^ addr) & mask) break; - if (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*icmph) + - sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4fbb674..d54bc94 100644 --- 
a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -43,6 +43,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <linux/netdevice.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -424,6 +425,93 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } +static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (dev->netdev_ops->ndo_setup_tc) { + offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; + offload.cls_u32->knode.handle = handle; + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (dev->netdev_ops->ndo_setup_tc) { + offload.cls_u32->command = TC_CLSU32_NEW_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (dev->netdev_ops->ndo_setup_tc) { + offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + 
} +} + +static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (dev->netdev_ops->ndo_setup_tc) { + offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; + offload.cls_u32->knode.handle = n->handle; + offload.cls_u32->knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + offload.cls_u32->knode.val = n->val; + offload.cls_u32->knode.mask = n->mask; +#else + offload.cls_u32->knode.val = 0; + offload.cls_u32->knode.mask = 0; +#endif + offload.cls_u32->knode.sel = &n->sel; + offload.cls_u32->knode.exts = &n->exts; + if (n->ht_down) + offload.cls_u32->knode.link_handle = n->ht_down->handle; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; @@ -434,6 +522,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); + u32_remove_hw_knode(tp, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -454,6 +543,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { + u32_clear_hw_hnode(tp, ht); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -540,8 +630,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) if (ht == NULL) return 0; - if (TC_U32_KEY(ht->handle)) + if (TC_U32_KEY(ht->handle)) { + u32_remove_hw_knode(tp, ht->handle); return u32_delete_key(tp, (struct tc_u_knode *)ht); + } if (root_ht == ht) return -EINVAL; @@ -769,6 +861,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); 
call_rcu(&n->rcu, u32_delete_key_rcu); + u32_replace_hw_knode(tp, new); return 0; } @@ -795,6 +888,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; + + u32_replace_hw_hnode(tp, ht); return 0; } @@ -877,7 +972,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); - + u32_replace_hw_knode(tp, n); *arg = (unsigned long)n; return 0; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index af1acf0..de1e176 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1841,7 +1841,7 @@ reclassify: return err; } - return -1; + return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= MAX_REC_LOOP)) { diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index ad70ecf..f9947d1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO}; unsigned int ntx; if (priv->qdiscs) { @@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) - dev->netdev_ops->ndo_setup_tc(dev, 0); + dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); else netdev_set_num_tc(dev, 0); } @@ -140,8 +141,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO, + .tc = qopt->num_tc}; + priv->hw_owned = 1; - err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) goto err; } else { diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a338091..3aa4307 100644 --- a/net/sctp/chunk.c +++ 
b/net/sctp/chunk.c @@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } -void sctp_datamsg_free(struct sctp_datamsg *msg) -{ - struct sctp_chunk *chunk; - - /* This doesn't have to be a _safe vairant because - * sctp_chunk_free() only drops the refs. - */ - list_for_each_entry(chunk, &msg->chunks, frag_list) - sctp_chunk_free(chunk); - - sctp_datamsg_put(msg); -} - /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { diff --git a/net/sctp/input.c b/net/sctp/input.c index 49d2cc7..21a2d6b 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -937,7 +937,6 @@ static struct sctp_association *__sctp_lookup_association( struct sctp_transport *t; struct sctp_association *asoc = NULL; - rcu_read_lock(); t = sctp_addrs_lookup_transport(net, local, peer); if (!t || !sctp_transport_hold(t)) goto out; @@ -949,7 +948,6 @@ static struct sctp_association *__sctp_lookup_association( sctp_transport_put(t); out: - rcu_read_unlock(); return asoc; } @@ -962,7 +960,9 @@ struct sctp_association *sctp_lookup_association(struct net *net, { struct sctp_association *asoc; + rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + rcu_read_unlock(); return asoc; } diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ded7d93..cfc3c71 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -161,7 +161,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa struct sctp_af *af; primary = &assoc->peer.primary_addr; - rcu_read_lock(); list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; @@ -172,7 +171,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa } af->seq_dump_addr(seq, addr); } - rcu_read_unlock(); } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e878da0..b89501e 100644 --- 
a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6106,9 +6106,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, return retval; } -static void sctp_hash(struct sock *sk) +static int sctp_hash(struct sock *sk) { /* STUB */ + return 0; } static void sctp_unhash(struct sock *sk) diff --git a/net/tipc/link.c b/net/tipc/link.c index 347cdc9..e31d92f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -123,7 +123,6 @@ struct tipc_stats { struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; - struct tipc_media_addr *media_addr; struct net *net; /* Management and link supervision data */ @@ -904,8 +903,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) return link_schedule_user(l, list); } - if (unlikely(msg_size(hdr) > mtu)) + if (unlikely(msg_size(hdr) > mtu)) { + skb_queue_purge(list); return -EMSGSIZE; + } /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { @@ -917,8 +918,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (likely(skb_queue_len(transmq) < maxwin)) { _skb = skb_clone(skb, GFP_ATOMIC); - if (!_skb) + if (!_skb) { + skb_queue_purge(list); return -ENOBUFS; + } __skb_dequeue(list); __skb_queue_tail(transmq, skb); __skb_queue_tail(xmitq, _skb); @@ -1261,26 +1264,6 @@ drop: return rc; } -/* - * Send protocol message to the other endpoint. 
- */ -static void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, - int probe_msg, u32 gap, u32 tolerance, - u32 priority) -{ - struct sk_buff *skb = NULL; - struct sk_buff_head xmitq; - - __skb_queue_head_init(&xmitq); - tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap, - tolerance, priority, &xmitq); - skb = __skb_dequeue(&xmitq); - if (!skb) - return; - tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr); - l->rcv_unacked = 0; -} - static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) @@ -1479,6 +1462,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) l->tolerance = peers_tol; + if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI, + TIPC_MAX_LINK_PRI)) { + l->priority = peers_prio; + rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } + l->silent_intv_cnt = 0; l->stats.recv_states++; if (msg_probe(hdr)) @@ -2023,16 +2012,18 @@ msg_full: return -EMSGSIZE; } -void tipc_link_set_tolerance(struct tipc_link *l, u32 tol) +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, + struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_proto_xmit(l, STATE_MSG, 0, 0, tol, 0); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); } -void tipc_link_set_prio(struct tipc_link *l, u32 prio) +void tipc_link_set_prio(struct tipc_link *l, u32 prio, + struct sk_buff_head *xmitq) { l->priority = prio; - tipc_link_proto_xmit(l, STATE_MSG, 0, 0, 0, prio); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) diff --git a/net/tipc/link.h b/net/tipc/link.h index b2ae0f4..b4ee9d6 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -112,8 +112,10 @@ char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); unsigned 
long tipc_link_tolerance(struct tipc_link *l); -void tipc_link_set_tolerance(struct tipc_link *l, u32 tol); -void tipc_link_set_prio(struct tipc_link *l, u32 prio); +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, + struct sk_buff_head *xmitq); +void tipc_link_set_prio(struct tipc_link *l, u32 prio, + struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 91fce70..777b979 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -418,6 +418,9 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, struct tipc_subscription *s) { struct sub_seq *sseq = nseq->sseqs; + struct tipc_name_seq ns; + + tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); list_add(&s->nameseq_list, &nseq->subscriptions); @@ -425,7 +428,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) { + if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; @@ -722,9 +725,10 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, void tipc_nametbl_subscribe(struct tipc_subscription *s) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); - u32 type = s->seq.type; + u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); int index = hash(type); struct name_seq *seq; + struct tipc_name_seq ns; spin_lock_bh(&tn->nametbl_lock); seq = nametbl_find_seq(s->net, type); @@ -735,8 +739,9 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s) tipc_nameseq_subscribe(seq, s); spin_unlock_bh(&seq->lock); } else { + tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); pr_warn("Failed to create 
subscription for {%u,%u,%u}\n", - s->seq.type, s->seq.lower, s->seq.upper); + ns.type, ns.lower, ns.upper); } spin_unlock_bh(&tn->nametbl_lock); } @@ -748,9 +753,10 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); struct name_seq *seq; + u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, s->seq.type); + seq = nametbl_find_seq(s->net, type); if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 2c016fd..de66d8f 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1104,7 +1104,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) req_nlh = (struct nlmsghdr *)skb->data; msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; - msg.dst_sk = info->dst_sk; msg.net = genl_info_net(info); if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { diff --git a/net/tipc/node.c b/net/tipc/node.c index 9d7a16f..9fcc2fb 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1166,7 +1166,7 @@ msg_full: * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) @@ -1174,33 +1174,43 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; - int bearer_id = -1; - int rc = -EHOSTUNREACH; + int bearer_id; + int rc; + + if (in_own_node(net, dnode)) { + tipc_sk_rcv(net, list); + return 0; + } - 
__skb_queue_head_init(&xmitq); n = tipc_node_find(net, dnode); - if (likely(n)) { - tipc_node_read_lock(n); - bearer_id = n->active_links[selector & 1]; - if (bearer_id >= 0) { - le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); - rc = tipc_link_xmit(le->link, list, &xmitq); - spin_unlock_bh(&le->lock); - } + if (unlikely(!n)) { + skb_queue_purge(list); + return -EHOSTUNREACH; + } + + tipc_node_read_lock(n); + bearer_id = n->active_links[selector & 1]; + if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); - if (likely(!rc)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - else if (rc == -ENOBUFS) - tipc_node_link_down(n, bearer_id, false); tipc_node_put(n); - return rc; + skb_queue_purge(list); + return -EHOSTUNREACH; } - if (likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } + __skb_queue_head_init(&xmitq); + le = &n->links[bearer_id]; + spin_lock_bh(&le->lock); + rc = tipc_link_xmit(le->link, list, &xmitq); + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(n); + + if (likely(rc == 0)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + else if (rc == -ENOBUFS) + tipc_node_link_down(n, bearer_id, false); + + tipc_node_put(n); + return rc; } @@ -1637,9 +1647,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) char *name; struct tipc_link *link; struct tipc_node *node; + struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); + __skb_queue_head_init(&xmitq); + if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; @@ -1683,13 +1696,13 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - tipc_link_set_tolerance(link, tol); + tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); - tipc_link_set_prio(link, prio); + tipc_link_set_prio(link, prio, &xmitq); } if 
(props[TIPC_NLA_PROP_WIN]) { u32 win; @@ -1701,7 +1714,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) out: tipc_node_read_unlock(node); - + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); return res; } diff --git a/net/tipc/server.c b/net/tipc/server.c index 922e04a..2446bfb 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -571,13 +571,13 @@ static void tipc_work_stop(struct tipc_server *s) static int tipc_work_start(struct tipc_server *s) { - s->rcv_wq = alloc_workqueue("tipc_rcv", WQ_UNBOUND, 1); + s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); if (!s->rcv_wq) { pr_err("can't start tipc receive workqueue\n"); return -ENOMEM; } - s->send_wq = alloc_workqueue("tipc_send", WQ_UNBOUND, 1); + s->send_wq = alloc_ordered_workqueue("tipc_send", 0); if (!s->send_wq) { pr_err("can't start tipc send workqueue\n"); destroy_workqueue(s->rcv_wq); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 69ee2ee..22963ca 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -92,25 +92,42 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, * * Returns 1 if there is overlap, otherwise 0. 
*/ -int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, +int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper) { - if (found_lower < sub->seq.lower) - found_lower = sub->seq.lower; - if (found_upper > sub->seq.upper) - found_upper = sub->seq.upper; + if (found_lower < seq->lower) + found_lower = seq->lower; + if (found_upper > seq->upper) + found_upper = seq->upper; if (found_lower > found_upper) return 0; return 1; } +u32 tipc_subscrp_convert_seq_type(u32 type, int swap) +{ + return htohl(type, swap); +} + +void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, + struct tipc_name_seq *out) +{ + out->type = htohl(in->type, swap); + out->lower = htohl(in->lower, swap); + out->upper = htohl(in->upper, swap); +} + void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, u32 node, int must) { - if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper)) + struct tipc_name_seq seq; + + tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); + if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; - if (!must && !(sub->filter & TIPC_SUB_PORTS)) + if (!must && + !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS)) return; tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, @@ -171,12 +188,14 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); /* Destroy any existing subscriptions for subscriber */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { - if (del_timer(&sub->timer)) { + timeout = htohl(sub->evt.s.timeout, sub->swap); + if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) { tipc_subscrp_delete(sub); tipc_subscrb_put(subscriber); } @@ -200,13 +219,16 @@ static void 
tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); /* Find first matching subscription, exit if not found */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - if (del_timer(&sub->timer)) { + timeout = htohl(sub->evt.s.timeout, sub->swap); + if ((timeout == TIPC_WAIT_FOREVER) || + del_timer(&sub->timer)) { tipc_subscrp_delete(sub); tipc_subscrb_put(subscriber); } @@ -216,66 +238,67 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, spin_unlock_bh(&subscriber->lock); } -static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, - struct tipc_subscription **sub_p) +static struct tipc_subscription *tipc_subscrp_create(struct net *net, + struct tipc_subscr *s, + int swap) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; - int swap; - - /* Determine subscriber's endianness */ - swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE)); - - /* Detect & process a subscription cancellation request */ - if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { - s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - tipc_subscrp_cancel(s, subscriber); - return 0; - } + u32 filter = htohl(s->filter, swap); /* Refuse subscription if global limit exceeded */ if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { pr_warn("Subscription rejected, limit reached (%u)\n", TIPC_MAX_SUBSCRIPTIONS); - return -EINVAL; + return NULL; } /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); - return -ENOMEM; + return NULL; } /* Initialize subscription object */ sub->net = net; - sub->seq.type = htohl(s->seq.type, swap); - sub->seq.lower = htohl(s->seq.lower, swap); - sub->seq.upper = htohl(s->seq.upper, swap); - 
sub->timeout = msecs_to_jiffies(htohl(s->timeout, swap)); - sub->filter = htohl(s->filter, swap); - if ((!(sub->filter & TIPC_SUB_PORTS) == - !(sub->filter & TIPC_SUB_SERVICE)) || - (sub->seq.lower > sub->seq.upper)) { + if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || + (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) { pr_warn("Subscription rejected, illegal request\n"); kfree(sub); - return -EINVAL; + return NULL; } - spin_lock_bh(&subscriber->lock); - list_add(&sub->subscrp_list, &subscriber->subscrp_list); - spin_unlock_bh(&subscriber->lock); - sub->subscriber = subscriber; + sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); + return sub; +} + +static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, int swap) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_subscription *sub = NULL; + u32 timeout; + + sub = tipc_subscrp_create(net, s, swap); + if (!sub) + return tipc_conn_terminate(tn->topsrv, subscriber->conid); + + spin_lock_bh(&subscriber->lock); + list_add(&sub->subscrp_list, &subscriber->subscrp_list); + tipc_subscrb_get(subscriber); + sub->subscriber = subscriber; + tipc_nametbl_subscribe(sub); + spin_unlock_bh(&subscriber->lock); + + timeout = htohl(sub->evt.s.timeout, swap); + if (timeout == TIPC_WAIT_FOREVER) + return; + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); - if (sub->timeout != TIPC_WAIT_FOREVER) - sub->timeout += jiffies; - if (!mod_timer(&sub->timer, sub->timeout)) - tipc_subscrb_get(subscriber); - *sub_p = sub; - return 0; + mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); } /* Handle one termination request for the subscriber */ @@ -289,14 +312,21 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, struct sockaddr_tipc *addr, void *usr_data, void *buf, size_t len) { - struct tipc_subscriber *subscrb = usr_data; - struct tipc_subscription *sub = NULL; 
- struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_subscriber *subscriber = usr_data; + struct tipc_subscr *s = (struct tipc_subscr *)buf; + int swap; + + /* Determine subscriber's endianness */ + swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | + TIPC_SUB_CANCEL)); - if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub)) - return tipc_conn_terminate(tn->topsrv, subscrb->conid); + /* Detect & process a subscription cancellation request */ + if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { + s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); + return tipc_subscrp_cancel(s, subscriber); + } - tipc_nametbl_subscribe(sub); + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 92ee18cc..be60103 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -50,21 +50,15 @@ struct tipc_subscriber; * @subscriber: pointer to its subscriber * @seq: name sequence associated with subscription * @net: point to network namespace - * @timeout: duration of subscription (in ms) - * @filter: event filtering to be done for subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list * @subscrp_list: adjacent subscriptions in subscriber's subscription list - * @server_ref: object reference of server port associated with subscription * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription */ struct tipc_subscription { struct tipc_subscriber *subscriber; - struct tipc_name_seq seq; struct net *net; - unsigned long timeout; - u32 filter; struct timer_list timer; struct list_head nameseq_list; struct list_head subscrp_list; @@ -72,11 +66,14 @@ struct tipc_subscription { struct tipc_event evt; }; -int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, +int 
tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, u32 node, int must); +void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, + struct tipc_name_seq *out); +u32 tipc_subscrp_convert_seq_type(u32 type, int swap); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f75f847..8269da7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1534,7 +1534,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; unsigned char max_level = 0; - int unix_sock_count = 0; if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1542,11 +1541,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) { struct sock *sk = unix_get_socket(scm->fp->fp[i]); - if (sk) { - unix_sock_count++; + if (sk) max_level = max(max_level, unix_sk(sk)->recursion_level); - } } if (unlikely(max_level > MAX_RECURSION_LEVEL)) return -ETOOMANYREFS; |