summary | refs | log | tree | commit | diff | stats
path: root/net/core
diff options
context:
space:
mode:
author: Timothy Pearson <tpearson@raptorengineering.com> 2017-08-23 14:45:25 -0500
committer: Timothy Pearson <tpearson@raptorengineering.com> 2017-08-23 14:45:25 -0500
commit: fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch)
tree: 22962a4387943edc841c72a4e636a068c66d58fd /net/core
download: ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip
ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz
Initial import of modified Linux 2.6.28 tree
Original upstream URL: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/Makefile 19
-rw-r--r-- net/core/datagram.c 651
-rw-r--r-- net/core/dev.c 4971
-rw-r--r-- net/core/dev_mcast.c 229
-rw-r--r-- net/core/dst.c 347
-rw-r--r-- net/core/ethtool.c 1042
-rw-r--r-- net/core/fib_rules.c 685
-rw-r--r-- net/core/filter.c 522
-rw-r--r-- net/core/flow.c 368
-rw-r--r-- net/core/gen_estimator.c 266
-rw-r--r-- net/core/gen_stats.c 241
-rw-r--r-- net/core/iovec.c 238
-rw-r--r-- net/core/kmap_skb.h 19
-rw-r--r-- net/core/link_watch.c 228
-rw-r--r-- net/core/neighbour.c 2831
-rw-r--r-- net/core/net-sysfs.c 529
-rw-r--r-- net/core/net-sysfs.h 8
-rw-r--r-- net/core/net_namespace.c 490
-rw-r--r-- net/core/netevent.c 70
-rw-r--r-- net/core/netpoll.c 852
-rw-r--r-- net/core/pktgen.c 3859
-rw-r--r-- net/core/request_sock.c 132
-rw-r--r-- net/core/rtnetlink.c 1429
-rw-r--r-- net/core/scm.c 314
-rw-r--r-- net/core/skb_dma_map.c 66
-rw-r--r-- net/core/skbuff.c 2665
-rw-r--r-- net/core/sock.c 2271
-rw-r--r-- net/core/stream.c 209
-rw-r--r-- net/core/sysctl_net_core.c 214
-rw-r--r-- net/core/user_dma.c 132
-rw-r--r-- net/core/utils.c 299
31 files changed, 26196 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
new file mode 100644
index 0000000..26a37cb
--- /dev/null
+++ b/net/core/Makefile
@@ -0,0 +1,19 @@
+#
+# Makefile for the Linux networking core.
+#
+
+obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
+ gen_stats.o gen_estimator.o net_namespace.o
+
+obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
+obj-$(CONFIG_HAS_DMA) += skb_dma_map.o
+
+obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+
+obj-$(CONFIG_XFRM) += flow.o
+obj-y += net-sysfs.o
+obj-$(CONFIG_NET_PKTGEN) += pktgen.o
+obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
+obj-$(CONFIG_FIB_RULES) += fib_rules.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
new file mode 100644
index 0000000..ee63184
--- /dev/null
+++ b/net/core/datagram.c
@@ -0,0 +1,651 @@
+/*
+ * SUCS NET3:
+ *
+ * Generic datagram handling routines. These are generic for all
+ * protocols. Possibly a generic IP version on top of these would
+ * make sense. Not tonight however 8-).
+ * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
+ * NetROM layer all have identical poll code and mostly
+ * identical recvmsg() code. So we share it here. The poll was
+ * shared before but buried in udp.c so I moved it.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
+ * udp.c code)
+ *
+ * Fixes:
+ * Alan Cox : NULL return from skb_peek_copy()
+ * understood
+ * Alan Cox : Rewrote skb_read_datagram to avoid the
+ * skb_peek_copy stuff.
+ * Alan Cox : Added support for SOCK_SEQPACKET.
+ * IPX can no longer use the SO_TYPE hack
+ * but AX.25 now works right, and SPX is
+ * feasible.
+ * Alan Cox : Fixed write poll of non IP protocol
+ * crash.
+ * Florian La Roche: Changed for my new skbuff handling.
+ * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
+ * Linus Torvalds : BSD semantic fixes.
+ * Alan Cox : Datagram iovec handling
+ * Darryl Miles : Fixed non-blocking SOCK_STREAM.
+ * Alan Cox : POSIXisms
+ * Pete Wyckoff : Unconnected accept() fix.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Report whether @sk is a connection-oriented socket, i.e. whether its
+ * type is SOCK_SEQPACKET or SOCK_STREAM. Returns 1 if so, 0 otherwise.
+ */
+static inline int connection_based(struct sock *sk)
+{
+	switch (sk->sk_type) {
+	case SOCK_SEQPACKET:
+	case SOCK_STREAM:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Wait for a packet..
+ *
+ * Queues the current task on sk->sk_sleep and, unless one of the checks
+ * below short-circuits, sleeps for up to *timeo_p via schedule_timeout().
+ *
+ * Return value (the caller loops again only while this is 0):
+ *   0   - the receive queue is non-empty, or we slept; *timeo_p holds the
+ *         value returned by schedule_timeout() in the latter case and
+ *         *err is left untouched.
+ *   1   - the receive side is shut down; *err is set to 0.
+ *   <0  - pending socket error, -ENOTCONN, or the signal errno from
+ *         sock_intr_errno(); the same value is stored in *err.
+ */
+static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
+{
+ int error;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+ /* Socket errors? */
+ error = sock_error(sk);
+ if (error)
+ goto out_err;
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ goto out; /* error is still 0 here: data is available, do not sleep */
+
+ /* Socket shut down? */
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out_noerr;
+
+ /* Sequenced packets can come disconnected.
+ * If so we report the problem
+ */
+ error = -ENOTCONN;
+ if (connection_based(sk) &&
+ !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
+ goto out_err;
+
+ /* handle signals */
+ if (signal_pending(current))
+ goto interrupted;
+
+ error = 0;
+ *timeo_p = schedule_timeout(*timeo_p); /* hand the remaining timeout back to the caller */
+out:
+ finish_wait(sk->sk_sleep, &wait);
+ return error;
+interrupted:
+ error = sock_intr_errno(*timeo_p);
+out_err:
+ *err = error;
+ goto out;
+out_noerr:
+ *err = 0;
+ error = 1; /* non-zero so the caller stops looping, while *err reports no error */
+ goto out;
+}
+
+/**
+ * __skb_recv_datagram - Receive a datagram skbuff
+ * @sk: socket
+ * @flags: MSG_ flags
+ * @peeked: set to the skb's previous peeked state (non-zero if this
+ * packet has been seen before); only written when an skb is found
+ * @err: error code returned; only written when NULL is returned
+ *
+ * Get a datagram skbuff, understands the peeking, nonblocking wakeups
+ * and possible races. This replaces identical code in packet, raw and
+ * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
+ * the long standing peek and read race for datagram sockets. If you
+ * alter this routine remember it must be re-entrant.
+ *
+ * With MSG_PEEK the skb stays on the receive queue, is marked as peeked
+ * and an extra reference is taken on it; otherwise it is unlinked from
+ * the queue. Either way the caller releases the returned skb (usually
+ * by calling skb_free_datagram).
+ *
+ * Only sk_receive_queue.lock is taken here, around the queue access,
+ * and it is released again before returning.
+ *
+ * * It does not lock socket since today. This function is
+ * * free of race conditions. This measure should/can improve
+ * * significantly datagram socket latencies at high loads,
+ * * when data copying to user space takes lots of time.
+ * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ * * 8) Great win.)
+ * * --ANK (980729)
+ *
+ * The order of the tests when we find no data waiting are specified
+ * quite explicitly by POSIX 1003.1g, don't change them without having
+ * the standard around please.
+ */
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
+ int *peeked, int *err)
+{
+ struct sk_buff *skb;
+ long timeo;
+ /*
+ * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
+ */
+ int error = sock_error(sk);
+
+ if (error)
+ goto no_packet;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ /* Again only user level code calls this function, so nothing
+ * interrupt level will suddenly eat the receive_queue.
+ *
+ * Look at current nfs client by the way...
+ * However, this function was correct in any case. 8)
+ */
+ unsigned long cpu_flags;
+
+ spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb) {
+ *peeked = skb->peeked;
+ if (flags & MSG_PEEK) {
+ skb->peeked = 1;
+ atomic_inc(&skb->users);
+ } else
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ }
+ spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+
+ if (skb)
+ return skb;
+
+ /* User doesn't want to wait */
+ error = -EAGAIN;
+ if (!timeo)
+ goto no_packet;
+
+ } while (!wait_for_packet(sk, err, &timeo)); /* wait_for_packet() sets *err itself when it returns non-zero */
+
+ return NULL;
+
+no_packet:
+ *err = error;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_datagram);
+
+/*
+ * Convenience wrapper around __skb_recv_datagram(): a non-zero @noblock
+ * adds MSG_DONTWAIT to @flags, and the "peeked" result is discarded.
+ */
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
+				  int noblock, int *err)
+{
+	int peeked;
+
+	if (noblock)
+		flags |= MSG_DONTWAIT;
+
+	return __skb_recv_datagram(sk, flags, &peeked, err);
+}
+
+/*
+ * Release a datagram skbuff: drops @skb via kfree_skb() and then calls
+ * sk_mem_reclaim() on @sk.
+ */
+void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ sk_mem_reclaim(sk);
+}
+
+/**
+ *	skb_kill_datagram - Forcibly drop a datagram skbuff
+ *	@sk: socket the datagram was received on
+ *	@skb: datagram skbuff to drop
+ *	@flags: MSG_ flags; must match the flags that were passed to
+ *		skb_recv_datagram() when @skb was obtained
+ *
+ *	Frees @skb and then calls sk_mem_reclaim() on @sk.
+ *
+ *	When MSG_PEEK is set in @flags and @skb is still the first entry
+ *	on the socket receive queue, it is unlinked from the queue and
+ *	skb->users is decremented before the skb is freed.
+ *
+ *	The sk_receive_queue lock is taken with only BH disabled, so this
+ *	must not be used in a context where that lock is acquired in an
+ *	IRQ context.
+ *
+ *	Returns 0 if MSG_PEEK was not set or the packet was removed from
+ *	the queue here, and -ENOENT if MSG_PEEK was set but @skb was no
+ *	longer at the head of the queue.
+ */
+
+int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+	int err = 0;
+
+	if (flags & MSG_PEEK) {
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			atomic_dec(&skb->users);
+		} else {
+			err = -ENOENT;
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+
+	kfree_skb(skb);
+	sk_mem_reclaim(sk);
+	return err;
+}
+
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
+ * skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @len: amount of data to copy from buffer to iovec
+ *
+ * Walks the linear header, then the page fragments, then the frag_list
+ * (recursively), copying until @len bytes have been transferred.
+ *
+ * Returns 0 once @len bytes have been copied, or -EFAULT if a copy
+ * fails or the skb runs out of data before @len reaches zero.
+ *
+ * Note: the iovec is modified during the copy.
+ */
+int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
+ struct iovec *to, int len)
+{
+ int start = skb_headlen(skb); /* skb offset at which the next piece begins */
+ int i, copy = start - offset;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_toiovec(to, skb->data + offset, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) { /* this fragment reaches past the current offset */
+ int err;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ err = memcpy_toiovec(to, vaddr + frag->page_offset +
+ offset - start, copy);
+ kunmap(page);
+ if (err)
+ goto fault;
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_iovec(list,
+ offset - start,
+ to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+
+/**
+ * skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ * @skb: buffer to copy into
+ * @offset: offset in the buffer to start copying to
+ * @from: io vector to copy from
+ * @len: amount of data to copy to buffer from iovec
+ *
+ * Fills the linear header, then the page fragments, then the frag_list
+ * (recursively), until @len bytes have been transferred.
+ *
+ * Returns 0 once @len bytes have been copied, or -EFAULT if a copy
+ * fails or the skb has no room left before @len reaches zero.
+ * Note: the iovec is modified during the copy.
+ */
+int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
+ struct iovec *from, int len)
+{
+ int start = skb_headlen(skb); /* skb offset at which the next piece begins */
+ int i, copy = start - offset;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_fromiovec(skb->data + offset, from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) { /* this fragment reaches past the current offset */
+ int err;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ err = memcpy_fromiovec(vaddr + frag->page_offset +
+ offset - start, from, copy);
+ kunmap(page);
+ if (err)
+ goto fault;
+
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_from_iovec(list,
+ offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+
+/*
+ * Copy @len bytes of @skb, starting at @offset, to the user buffer @to
+ * while folding a checksum of the copied data into *@csump.
+ *
+ * The linear header is checksummed directly into *@csump; each page
+ * fragment and frag_list member is checksummed separately and merged
+ * with csum_block_add() at position "pos", the number of bytes already
+ * copied by this call.
+ *
+ * Returns 0 once @len bytes have been copied, or -EFAULT if a copy
+ * fails or the skb runs out of data before @len reaches zero.
+ */
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+ u8 __user *to, int len,
+ __wsum *csump)
+{
+ int start = skb_headlen(skb); /* skb offset at which the next piece begins */
+ int pos = 0; /* bytes copied so far; block offset for csum_block_add() */
+ int i, copy = start - offset;
+
+ /* Copy header. */
+ if (copy > 0) {
+ int err = 0;
+ if (copy > len)
+ copy = len;
+ *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
+ *csump, &err);
+ if (err)
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ int err = 0;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ csum2 = csum_and_copy_to_user(vaddr +
+ frag->page_offset +
+ offset - start,
+ to, copy, 0, &err);
+ kunmap(page);
+ if (err)
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list=list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2 = 0; /* the nested call starts from a zero checksum */
+ if (copy > len)
+ copy = len;
+ if (skb_copy_and_csum_datagram(list,
+ offset - start,
+ to, copy,
+ &csum2))
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+
+/*
+ * Fold the checksum over the first @len bytes of @skb, seeded with
+ * skb->csum. On a zero result the skb is marked CHECKSUM_UNNECESSARY;
+ * if it had been marked CHECKSUM_COMPLETE, netdev_rx_csum_fault() is
+ * called for its device first. Returns the folded sum (0 on success).
+ */
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+	__sum16 folded = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+
+	if (unlikely(folded))
+		return folded;
+
+	if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+		netdev_rx_csum_fault(skb->dev);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	return folded;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+/*
+ * Verify the checksum over the whole of @skb (all skb->len bytes) by
+ * delegating to __skb_checksum_complete_head().
+ */
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+	const int whole_len = skb->len;
+
+	return __skb_checksum_complete_head(skb, whole_len);
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
+/**
+ * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
+ * @skb: skbuff
+ * @hlen: hardware length; the first @hlen bytes of the skb are
+ * checksummed but not copied (presumably the protocol header
+ * length - TODO confirm against callers)
+ * @iov: io vector
+ *
+ * Copies the skb->len - @hlen bytes that follow the first @hlen bytes.
+ * If the first non-empty iovec element is too small for them, the
+ * checksum is verified first and the data is then copied with
+ * skb_copy_datagram_iovec(); otherwise copy and checksum are done in a
+ * single pass into that element.
+ *
+ * Caller _must_ check that skb will fit to this iovec.
+ *
+ * Returns: 0 - success.
+ * -EINVAL - checksum failure.
+ * -EFAULT - fault during copy. Beware, in this case iovec
+ * can be modified!
+ */
+int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
+ int hlen, struct iovec *iov)
+{
+ __wsum csum;
+ int chunk = skb->len - hlen;
+
+ if (!chunk)
+ return 0;
+
+ /* Skip filled elements.
+ * Pretty silly, look at memcpy_toiovec, though 8)
+ * NOTE(review): there is no bound on this walk; it relies on the
+ * caller having checked that the skb fits in the iovec.
+ */
+ while (!iov->iov_len)
+ iov++;
+
+ if (iov->iov_len < chunk) {
+ if (__skb_checksum_complete(skb))
+ goto csum_error;
+ if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
+ goto fault;
+ } else {
+ csum = csum_partial(skb->data, hlen, skb->csum);
+ if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
+ chunk, &csum))
+ goto fault;
+ if (csum_fold(csum))
+ goto csum_error;
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+ netdev_rx_csum_fault(skb->dev);
+ iov->iov_len -= chunk; /* this path bypasses memcpy_toiovec(), so advance the element by hand */
+ iov->iov_base += chunk;
+ }
+ return 0;
+csum_error:
+ return -EINVAL;
+fault:
+ return -EFAULT;
+}
+
+/**
+ *	datagram_poll - generic datagram poll
+ *	@file: file struct
+ *	@sock: socket
+ *	@wait: poll table
+ *
+ *	Generic poll implementation for datagram sockets. It also serves
+ *	sequenced packet sockets, provided the socket receive queue only
+ *	ever holds data that is ready to be received.
+ *
+ *	Returns the mask of POLL* events currently pending on the socket.
+ *
+ *	Note: when you _don't_ use this routine for this protocol,
+ *	and you use a different write policy from sock_writeable()
+ *	then please supply your own write_space callback.
+ */
+unsigned int datagram_poll(struct file *file, struct socket *sock,
+			   poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int events = 0;
+
+	poll_wait(file, sk->sk_sleep, wait);
+
+	/* Exceptional events: pending error, or skbs on the error queue. */
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		events |= POLLERR;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		events |= POLLRDHUP;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		events |= POLLHUP;
+
+	/* Readable: data is queued, or the receive side is shut down. */
+	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+	    (sk->sk_shutdown & RCV_SHUTDOWN))
+		events |= POLLIN | POLLRDNORM;
+
+	/* Connection-based sockets also report termination and startup. */
+	if (connection_based(sk)) {
+		if (sk->sk_state == TCP_CLOSE)
+			events |= POLLHUP;
+		/* Connection not started yet: do not report writability. */
+		if (sk->sk_state == TCP_SYN_SENT)
+			return events;
+	}
+
+	/* Not writable: flag the socket as short of space and stop here. */
+	if (!sock_writeable(sk)) {
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		return events;
+	}
+
+	return events | POLLOUT | POLLWRNORM | POLLWRBAND;
+}
+
+EXPORT_SYMBOL(datagram_poll);
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_datagram_iovec);
+EXPORT_SYMBOL(skb_free_datagram);
+EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
new file mode 100644
index 0000000..9174c77
--- /dev/null
+++ b/net/core/dev.c
@@ -0,0 +1,4971 @@
+/*
+ * NET3 Protocol independent device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the non IP parts of dev.c 1.0.19
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Florian la Roche <rzsfl@rz.uni-sb.de>
+ * Alan Cox <gw4pts@gw4pts.ampr.org>
+ * David Hinds <dahinds@users.sourceforge.net>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Adam Sulmicki <adam@cfar.umd.edu>
+ * Pekka Riikonen <priikone@poesidon.pspt.fi>
+ *
+ * Changes:
+ * D.J. Barrow : Fixed bug where dev->refcnt gets set
+ * to 2 if register_netdev gets called
+ * before net_dev_init & also removed a
+ * few lines of code in the process.
+ * Alan Cox : device private ioctl copies fields back.
+ * Alan Cox : Transmit queue code does relevant
+ * stunts to keep the queue safe.
+ * Alan Cox : Fixed double lock.
+ * Alan Cox : Fixed promisc NULL pointer trap
+ * ???????? : Support the full private ioctl range
+ * Alan Cox : Moved ioctl permission check into
+ * drivers
+ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
+ * Alan Cox : 100 backlog just doesn't cut it when
+ * you start doing multicast video 8)
+ * Alan Cox : Rewrote net_bh and list manager.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Took out transmit every packet pass
+ * Saved a few bytes in the ioctl handler
+ * Alan Cox : Network driver sets packet type before
+ * calling netif_rx. Saves a function
+ * call a packet.
+ * Alan Cox : Hashed net_bh()
+ * Richard Kooijman: Timestamp fixes.
+ * Alan Cox : Wrong field in SIOCGIFDSTADDR
+ * Alan Cox : Device lock protection.
+ * Alan Cox : Fixed nasty side effect of device close
+ * changes.
+ * Rudi Cilibrasi : Pass the right thing to
+ * set_mac_address()
+ * Dave Miller : 32bit quantity for the device lock to
+ * make it work out on a Sparc.
+ * Bjorn Ekwall : Added KERNELD hack.
+ * Alan Cox : Cleaned up the backlog initialise.
+ * Craig Metz : SIOCGIFCONF fix if space for under
+ * 1 device.
+ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
+ * is no device open function.
+ * Andi Kleen : Fix error reporting for SIOCGIFCONF
+ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
+ * Cyrus Durgin : Cleaned for KMOD
+ * Adam Sulmicki : Bug Fix : Network Device Unload
+ * A network device unload needs to purge
+ * the backlog queue.
+ * Paul Rusty Russell : SIOCSIFNAME
+ * Pekka Riikonen : Netdev boot-time settings code
+ * Andrew Morton : Make unregister_netdevice wait
+ * indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
+ * - netif_rx() feedback
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/notifier.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/if_bridge.h>
+#include <linux/if_macvlan.h>
+#include <net/dst.h>
+#include <net/pkt_sched.h>
+#include <net/checksum.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/netpoll.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+#include <net/wext.h>
+#include <net/iw_handler.h>
+#include <asm/current.h>
+#include <linux/audit.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include "net-sysfs.h"
+
+/*
+ * The list of packet types we will receive (as opposed to discard)
+ * and the routines to invoke.
+ *
+ * Why 16. Because with 16 the only overlap we get on a hash of the
+ * low nibble of the protocol value is RARP/SNAP/X.25.
+ *
+ * NOTE: That is no longer true with the addition of VLAN tags. Not
+ * sure which should go first, but I bet it won't make much
+ * difference if we are running VLANs. The good news is that
+ * this protocol won't be in the list unless compiled in, so
+ * the average user (w/out VLANs) will not be adversely affected.
+ * --BLG
+ *
+ * 0800 IP
+ * 8100 802.1Q VLAN
+ * 0001 802.3
+ * 0002 AX.25
+ * 0004 802.2
+ * 8035 RARP
+ * 0005 SNAP
+ * 0805 X.25
+ * 0806 ARP
+ * 8137 IPX
+ * 0009 Localtalk
+ * 86DD IPv6
+ */
+
+#define PTYPE_HASH_SIZE (16)
+#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) /* bucket = ntohs(type) & PTYPE_HASH_MASK */
+
+static DEFINE_SPINLOCK(ptype_lock); /* taken by dev_add_pack() and __dev_remove_pack() around list updates */
+static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; /* handlers for specific protocol types, hashed by type */
+static struct list_head ptype_all __read_mostly; /* Taps: handlers registered for ETH_P_ALL */
+
+#ifdef CONFIG_NET_DMA
+/* State kept by the networking core when CONFIG_NET_DMA is enabled. */
+struct net_dma {
+ struct dma_client client; /* carries the event callback set in the initializer below */
+ spinlock_t lock; /* NOTE(review): presumably guards channel_mask/channels - confirm */
+ cpumask_t channel_mask;
+ struct dma_chan **channels;
+};
+
+/* Declared ahead of its definition so the initializer below can refer to it. */
+static enum dma_state_client
+netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+ enum dma_state state);
+
+static struct net_dma net_dma = {
+ .client = {
+ .event_callback = netdev_dma_event,
+ },
+};
+#endif
+
+/*
+ * The @dev_base_head list is protected by @dev_base_lock and the rtnl
+ * semaphore.
+ *
+ * Pure readers hold dev_base_lock for reading.
+ *
+ * Writers must hold the rtnl semaphore while they loop through the
+ * dev_base_head list, and hold dev_base_lock for writing when they do the
+ * actual updates. This allows pure readers to access the list even
+ * while a writer is preparing to update it.
+ *
+ * To put it another way, dev_base_lock is held for writing only to
+ * protect against pure readers; the rtnl semaphore provides the
+ * protection against other writers.
+ *
+ * See, for example usages, register_netdevice() and
+ * unregister_netdevice(), which must be called with the rtnl
+ * semaphore held.
+ */
+DEFINE_RWLOCK(dev_base_lock);
+
+EXPORT_SYMBOL(dev_base_lock);
+
+#define NETDEV_HASHBITS 8
+#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
+
+/*
+ * Map an interface name to its bucket in @net's device-name hash table.
+ * At most IFNAMSIZ bytes of @name are hashed.
+ */
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+	unsigned int len = strnlen(name, IFNAMSIZ);
+	unsigned int bucket = full_name_hash(name, len) & (NETDEV_HASHENTRIES - 1);
+
+	return &net->dev_name_head[bucket];
+}
+
+/*
+ * Map an interface index to its bucket in @net's device-index hash
+ * table: the low NETDEV_HASHBITS bits of @ifindex select the bucket.
+ */
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+	const int bucket = ifindex & (NETDEV_HASHENTRIES - 1);
+
+	return &net->dev_index_head[bucket];
+}
+
+/*
+ * Device list insertion: link @dev into its namespace's device list and
+ * into the name and ifindex hash tables. The caller must hold the rtnl
+ * semaphore; dev_base_lock is taken for writing around the updates.
+ * Always returns 0.
+ */
+static int list_netdevice(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct hlist_head *name_bucket;
+	struct hlist_head *index_bucket;
+
+	ASSERT_RTNL();
+
+	write_lock_bh(&dev_base_lock);
+	list_add_tail(&dev->dev_list, &net->dev_base_head);
+
+	name_bucket = dev_name_hash(net, dev->name);
+	hlist_add_head(&dev->name_hlist, name_bucket);
+
+	index_bucket = dev_index_hash(net, dev->ifindex);
+	hlist_add_head(&dev->index_hlist, index_bucket);
+	write_unlock_bh(&dev_base_lock);
+
+	return 0;
+}
+
+/*
+ * Device list removal: the inverse of list_netdevice(). The caller must
+ * hold the rtnl semaphore (asserted below).
+ */
+static void unlist_netdevice(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ /* Unlink dev from the device chain and from both hash tables,
+ * holding dev_base_lock for writing to exclude pure readers.
+ */
+ write_lock_bh(&dev_base_lock);
+ list_del(&dev->dev_list);
+ hlist_del(&dev->name_hlist);
+ hlist_del(&dev->index_hlist);
+ write_unlock_bh(&dev_base_lock);
+}
+
+/*
+ * Our notifier list
+ */
+
+static RAW_NOTIFIER_HEAD(netdev_chain);
+
+/*
+ * Device drivers call our routines to queue packets here. We empty the
+ * queue in the local softnet handler.
+ */
+
+DEFINE_PER_CPU(struct softnet_data, softnet_data);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ *
+ * netdev_lock_type[] and netdev_lock_name[] are parallel arrays: the
+ * index that netdev_lock_pos() finds in netdev_lock_type[] is used to
+ * pick both the lock class key and the name, so entries must be kept
+ * in the same order in both tables. The last entry (ARPHRD_NONE) is the
+ * fallback for device types that are not listed.
+ */
+static const unsigned short netdev_lock_type[] =
+ {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
+ ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
+ ARPHRD_NONE};
+
+static const char *netdev_lock_name[] =
+ {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
+ "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
+ "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+/*
+ * Return the index of @dev_type in netdev_lock_type[], or the index of
+ * the last entry when the type is not listed.
+ */
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+	int pos = 0;
+
+	while (pos < ARRAY_SIZE(netdev_lock_type)) {
+		if (netdev_lock_type[pos] == dev_type)
+			return pos;
+		pos++;
+	}
+
+	/* the last key is used by default */
+	return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+/*
+ * Give @lock the xmit lockdep class and name that correspond to
+ * @dev_type.
+ */
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+						 unsigned short dev_type)
+{
+	const int idx = netdev_lock_pos(dev_type);
+
+	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[idx],
+				   netdev_lock_name[idx]);
+}
+
+/*
+ * Give dev->addr_list_lock the address-list lockdep class and the name
+ * that correspond to dev->type.
+ */
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+	const int idx = netdev_lock_pos(dev->type);
+
+	lockdep_set_class_and_name(&dev->addr_list_lock,
+				   &netdev_addr_lock_key[idx],
+				   netdev_lock_name[idx]);
+}
+#else
+/* Without CONFIG_LOCKDEP there are no lock classes to set: empty stubs. */
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
+#endif
+
+/*******************************************************************************
+
+ Protocol management and registration routines
+
+*******************************************************************************/
+
+/*
+ * Add a protocol ID to the list. Now that the input handler is
+ * smarter we can dispense with all the messy stuff that used to be
+ * here.
+ *
+ * BEWARE!!! Protocol handlers, mangling input packets,
+ * MUST BE last in hash buckets and checking protocol handlers
+ * MUST start from promiscuous ptype_all chain in net_bh.
+ * It is true now, do not change it.
+ * Explanation follows: if protocol handler, mangling packet, will
+ * be the first on list, it is not able to sense, that packet
+ * is cloned and should be copied-on-write, so that it will
+ * change it and subsequent readers will get broken packet.
+ * --ANK (980803)
+ */
+
+/**
+ *	dev_add_pack - add packet handler
+ *	@pt: packet type declaration
+ *
+ *	Register a protocol handler with the networking stack. @pt is
+ *	linked into the kernel lists (the ptype_all list for ETH_P_ALL,
+ *	otherwise the ptype_base bucket selected by its type) and may not
+ *	be freed until it has been removed from those lists again.
+ *
+ *	This call does not sleep therefore it can not
+ *	guarantee all CPU's that are in middle of receiving packets
+ *	will see the new packet type (until the next received packet).
+ */
+
+void dev_add_pack(struct packet_type *pt)
+{
+	struct list_head *head;
+
+	spin_lock_bh(&ptype_lock);
+
+	if (pt->type == htons(ETH_P_ALL))
+		head = &ptype_all;
+	else
+		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+
+	list_add_rcu(&pt->list, head);
+
+	spin_unlock_bh(&ptype_lock);
+}
+
+/**
+ * __dev_remove_pack - remove packet handler
+ * @pt: packet type declaration
+ *
+ * Remove a protocol handler that was previously added to the kernel
+ * protocol handlers by dev_add_pack(). The passed &packet_type is
+ * unlinked from the kernel lists with list_del_rcu() under ptype_lock.
+ * If it is not found on the expected list, a warning is printed and
+ * nothing is removed.
+ *
+ * This function does not wait for concurrent receivers: the packet
+ * type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state. dev_remove_pack() is the variant that
+ * waits, by calling synchronize_net() afterwards.
+ */
+void __dev_remove_pack(struct packet_type *pt)
+{
+ struct list_head *head;
+ struct packet_type *pt1;
+
+ spin_lock_bh(&ptype_lock);
+
+ if (pt->type == htons(ETH_P_ALL))
+ head = &ptype_all;
+ else
+ head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+
+ list_for_each_entry(pt1, head, list) {
+ if (pt == pt1) {
+ list_del_rcu(&pt->list);
+ goto out;
+ }
+ }
+
+ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
+out:
+ spin_unlock_bh(&ptype_lock);
+}
+/**
+ * dev_remove_pack - remove packet handler
+ * @pt: packet type declaration
+ *
+ * Remove a protocol handler that was previously added to the kernel
+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
+ * from the kernel lists and can be freed or reused once this function
+ * returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_pack(struct packet_type *pt)
+{
+ /* Unlink the handler; __dev_remove_pack() warns if it is not listed. */
+ __dev_remove_pack(pt);
+
+ /* Do not return to the caller until synchronize_net() has completed. */
+ synchronize_net();
+}
+
+/******************************************************************************
+
+ Device Boot-time Settings Routines
+
+*******************************************************************************/
+
+/* Boot time configuration table */
+static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+
+/**
+ * netdev_boot_setup_add - add new setup entry
+ * @name: name of the device
+ * @map: configured settings for the device
+ *
+ * Adds new setup entry to the dev_boot_setup list. The function
+ * returns 0 on error and 1 on success. This is a generic routine to
+ * all netdevices.
+ */
+static int netdev_boot_setup_add(char *name, struct ifmap *map)
+{
+	struct netdev_boot_setup *slot = dev_boot_setup;
+	struct netdev_boot_setup *end = dev_boot_setup + NETDEV_BOOT_SETUP_MAX;
+
+	for (; slot < end; slot++) {
+		/* A slot is free when its name is empty or starts with a blank. */
+		if (slot->name[0] != '\0' && slot->name[0] != ' ')
+			continue;
+
+		memset(slot->name, 0, sizeof(slot->name));
+		strlcpy(slot->name, name, IFNAMSIZ);
+		memcpy(&slot->map, map, sizeof(slot->map));
+		return 1;
+	}
+
+	/* Every slot is taken. */
+	return 0;
+}
+
+/**
+ * netdev_boot_setup_check - check boot time settings
+ * @dev: the netdevice
+ *
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
+ */
+int netdev_boot_setup_check(struct net_device *dev)
+{
+	int idx;
+
+	for (idx = 0; idx < NETDEV_BOOT_SETUP_MAX; idx++) {
+		struct netdev_boot_setup *entry = &dev_boot_setup[idx];
+
+		/* Unused slots have an empty or blank-leading name. */
+		if (entry->name[0] == '\0' || entry->name[0] == ' ')
+			continue;
+		if (strcmp(dev->name, entry->name) != 0)
+			continue;
+
+		/* First match wins: copy its resources into the device. */
+		dev->irq = entry->map.irq;
+		dev->base_addr = entry->map.base_addr;
+		dev->mem_start = entry->map.mem_start;
+		dev->mem_end = entry->map.mem_end;
+		return 1;
+	}
+
+	return 0;
+}
+
+
+/**
+ *	netdev_boot_base	- get address from boot time settings
+ *	@prefix: prefix for network device
+ *	@unit: id for network device
+ *
+ *	Build the interface name "<prefix><unit>" and look it up.
+ *	Returns 1 if a device of that name is already registered in
+ *	init_net (telling the caller not to probe for it), otherwise the
+ *	base address of the boot time setting with that name, or 0 if
+ *	there is none.  A name that does not fit in IFNAMSIZ bytes
+ *	yields 0.
+ */
+unsigned long netdev_boot_base(const char *prefix, int unit)
+{
+	const struct netdev_boot_setup *s = dev_boot_setup;
+	char name[IFNAMSIZ];
+	int len;
+	int i;
+
+	/*
+	 * Format with a bound: an unbounded sprintf() would write past
+	 * name[] for a long prefix.  snprintf() reports the length it
+	 * needed, so a result >= sizeof(name) means truncation.
+	 */
+	len = snprintf(name, sizeof(name), "%s%d", prefix, unit);
+	if (len < 0 || len >= (int)sizeof(name))
+		return 0;
+
+	/*
+	 * If device already registered then return base of 1
+	 * to indicate not to probe for this interface
+	 */
+	if (__dev_get_by_name(&init_net, name))
+		return 1;
+
+	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
+		if (!strcmp(name, s[i].name))
+			return s[i].map.base_addr;
+	return 0;
+}
+
+/*
+ * Saves at boot time configured settings for any netdevice.
+ */
+int __init netdev_boot_setup(char *str)
+{
+	struct ifmap map;
+	int ints[5];
+	int count;
+
+	/* Parse the leading integers; what is left of @str is the name. */
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	if (!str || !*str)
+		return 0;
+
+	/* ints[0] gates how many of ints[1..4] are taken as settings. */
+	count = ints[0];
+
+	memset(&map, 0, sizeof(map));
+	if (count > 0)
+		map.irq = ints[1];
+	if (count > 1)
+		map.base_addr = ints[2];
+	if (count > 2)
+		map.mem_start = ints[3];
+	if (count > 3)
+		map.mem_end = ints[4];
+
+	/* Record the settings under the remaining string as device name. */
+	return netdev_boot_setup_add(str, &map);
+}
+
+__setup("netdev=", netdev_boot_setup);
+
+/*******************************************************************************
+
+ Device Interface Subroutines
+
+*******************************************************************************/
+
+/**
+ * __dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name. Must be called under RTNL semaphore
+ * or @dev_base_lock. If the name is found a pointer to the device
+ * is returned. If the name is not found then %NULL is returned. The
+ * reference counters are not incremented so the caller must be
+ * careful with locks.
+ */
+
+struct net_device *__dev_get_by_name(struct net *net, const char *name)
+{
+	struct hlist_node *node;
+
+	/* Walk only the hash chain that @name maps to. */
+	hlist_for_each(node, dev_name_hash(net, name)) {
+		struct net_device *cand =
+			hlist_entry(node, struct net_device, name_hlist);
+
+		if (strncmp(cand->name, name, IFNAMSIZ) == 0)
+			return cand;
+	}
+
+	return NULL;
+}
+
+/**
+ * dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name. This can be called from any
+ * context and does its own locking. The returned handle has
+ * the usage count incremented and the caller must use dev_put() to
+ * release it when it is no longer needed. %NULL is returned if no
+ * matching device is found.
+ */
+
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+	struct net_device *found;
+
+	read_lock(&dev_base_lock);
+	found = __dev_get_by_name(net, name);
+	/* Take the reference while the lock is still held. */
+	if (found != NULL)
+		dev_hold(found);
+	read_unlock(&dev_base_lock);
+
+	return found;
+}
+
+/**
+ * __dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold either the RTNL semaphore
+ * or @dev_base_lock.
+ */
+
+struct net_device *__dev_get_by_index(struct net *net, int ifindex)
+{
+	struct hlist_node *node;
+
+	/* Walk only the hash chain that @ifindex maps to. */
+	hlist_for_each(node, dev_index_hash(net, ifindex)) {
+		struct net_device *cand =
+			hlist_entry(node, struct net_device, index_hlist);
+
+		if (cand->ifindex == ifindex)
+			return cand;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns NULL if the device
+ * is not found or a pointer to the device. The device returned has
+ * had a reference added and the pointer is safe until the user calls
+ * dev_put to indicate they have finished with it.
+ */
+
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+	struct net_device *found;
+
+	read_lock(&dev_base_lock);
+	found = __dev_get_by_index(net, ifindex);
+	/* Take the reference while the lock is still held. */
+	if (found != NULL)
+		dev_hold(found);
+	read_unlock(&dev_base_lock);
+
+	return found;
+}
+
+/**
+ * dev_getbyhwaddr - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Search for an interface by MAC address. Returns NULL if the device
+ * is not found or a pointer to the device. The caller must hold the
+ * rtnl semaphore. The returned device has not had its ref count increased
+ * and the caller must therefore be careful about locking
+ *
+ * BUGS:
+ * If the API was consistent this would be __dev_get_by_hwaddr
+ */
+
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(net, dev) {
+		if (dev->type != type)
+			continue;
+		/* Compare exactly addr_len bytes of the hardware address. */
+		if (memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
+			return dev;
+	}
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
+/*
+ *	Return the first device in @net whose type equals @type, or NULL.
+ *	No reference is taken; the function asserts that RTNL is held.
+ */
+struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(net, dev) {
+		if (dev->type == type)
+			return dev;
+	}
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(__dev_getfirstbyhwtype);
+
+/*
+ *	Locked variant of __dev_getfirstbyhwtype(): takes RTNL itself and
+ *	returns the device with a reference held (via dev_hold()), or NULL.
+ */
+struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+	struct net_device *found;
+
+	rtnl_lock();
+	found = __dev_getfirstbyhwtype(net, type);
+	if (found != NULL)
+		dev_hold(found);
+	rtnl_unlock();
+
+	return found;
+}
+
+EXPORT_SYMBOL(dev_getfirstbyhwtype);
+
+/**
+ * dev_get_by_flags - find any device with given flags
+ * @net: the applicable net namespace
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
+ *
+ * Search for any interface with the given flags. Returns NULL if a device
+ * is not found or a pointer to the device. The device returned has
+ * had a reference added and the pointer is safe until the user calls
+ * dev_put to indicate they have finished with it.
+ */
+
+struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
+{
+	struct net_device *dev;
+	struct net_device *match = NULL;
+
+	read_lock(&dev_base_lock);
+	for_each_netdev(net, dev) {
+		/* Any difference inside @mask disqualifies the device. */
+		if (((dev->flags ^ if_flags) & mask) != 0)
+			continue;
+
+		dev_hold(dev);
+		match = dev;
+		break;
+	}
+	read_unlock(&dev_base_lock);
+
+	return match;
+}
+
+/**
+ *	dev_valid_name - check if name is okay for network device
+ *	@name: name string
+ *
+ *	Network device names need to be valid file names to
+ *	allow sysfs to work.  We also disallow any kind of
+ *	whitespace.  Returns 1 if @name is acceptable, 0 otherwise.
+ */
+int dev_valid_name(const char *name)
+{
+	const char *c;
+
+	/* Reject empty names and names that do not fit in IFNAMSIZ. */
+	if (name[0] == '\0' || strlen(name) >= IFNAMSIZ)
+		return 0;
+
+	/* "." and ".." are rejected outright. */
+	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return 0;
+
+	for (c = name; *c != '\0'; c++) {
+		if (*c == '/' || isspace(*c))
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * __dev_alloc_name - allocate a name for a device
+ * @net: network namespace to allocate the device name in
+ * @name: name format string
+ * @buf: scratch buffer and result name string
+ *
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+{
+ int i = 0;
+ const char *p;
+ const int max_netdevices = 8*PAGE_SIZE;
+ unsigned long *inuse;
+ struct net_device *d;
+
+ /* Only the first IFNAMSIZ-1 characters are searched for a '%'. */
+ p = strnchr(name, IFNAMSIZ-1, '%');
+ if (p) {
+ /*
+ * Verify the string as this thing may have come from
+ * the user. The first '%' must start a "%d" and no other
+ * '%' character may follow it.
+ */
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
+ /* Use one page as a bit array of possible slots */
+ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
+
+ /* Mark every unit number already taken by a device in @net. */
+ for_each_netdev(net, d) {
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
+
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ set_bit(i, inuse);
+ }
+
+ /* Lowest unused unit; the scratch page is no longer needed. */
+ i = find_first_zero_bit(inuse, max_netdevices);
+ free_page((unsigned long) inuse);
+ }
+
+ /*
+ * Build the candidate name (with i == 0 when @name has no '%')
+ * and accept it only if no device in @net already uses it.
+ */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!__dev_get_by_name(net, buf))
+ return i;
+
+ /* It is possible to run out of possible slots
+ * when the name is long and there isn't enough space left
+ * for the digits, or if all bits are used.
+ */
+ return -ENFILE;
+}
+
+/**
+ * dev_alloc_name - allocate a name for a device
+ * @dev: device
+ * @name: name format string
+ *
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+int dev_alloc_name(struct net_device *dev, const char *name)
+{
+	char tmp[IFNAMSIZ];
+	struct net *ns;
+	int unit;
+
+	BUG_ON(!dev_net(dev));
+	ns = dev_net(dev);
+
+	/* Build the name in a scratch buffer; commit it only on success. */
+	unit = __dev_alloc_name(ns, name, tmp);
+	if (unit < 0)
+		return unit;
+
+	strlcpy(dev->name, tmp, IFNAMSIZ);
+	return unit;
+}
+
+
+/**
+ *	dev_change_name - change name of a device
+ *	@dev: device
+ *	@newname: name (or format string) must be at least IFNAMSIZ
+ *
+ *	Change name of a device, can pass format strings "eth%d"
+ *	for wildcarding.  The device must be down (-EBUSY otherwise)
+ *	and the function asserts that RTNL is held.
+ *
+ *	Returns a negative errno on failure.  On success the result is 0,
+ *	or the non-negative value returned by dev_alloc_name() when
+ *	@newname contained a '%'.  If a NETDEV_CHANGENAME notifier
+ *	rejects the change, the old name is restored and the notifier's
+ *	error is returned.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+	char oldname[IFNAMSIZ];
+	int err = 0;
+	int ret;
+	struct net *net;
+
+	ASSERT_RTNL();
+	BUG_ON(!dev_net(dev));
+
+	net = dev_net(dev);
+	if (dev->flags & IFF_UP)
+		return -EBUSY;
+
+	if (!dev_valid_name(newname))
+		return -EINVAL;
+
+	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+		return 0;
+
+	/* Keep the current name so a later failure can restore it. */
+	memcpy(oldname, dev->name, IFNAMSIZ);
+
+	if (strchr(newname, '%')) {
+		/* On success this writes dev->name and returns the unit (>= 0). */
+		err = dev_alloc_name(dev, newname);
+		if (err < 0)
+			return err;
+	} else if (__dev_get_by_name(net, newname))
+		return -EEXIST;
+	else
+		strlcpy(dev->name, newname, IFNAMSIZ);
+
+rollback:
+	ret = device_rename(&dev->dev, dev->name);
+	if (ret) {
+		memcpy(dev->name, oldname, IFNAMSIZ);
+		return ret;
+	}
+
+	/* Re-hash the device under whatever dev->name now holds. */
+	write_lock_bh(&dev_base_lock);
+	hlist_del(&dev->name_hlist);
+	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
+	write_unlock_bh(&dev_base_lock);
+
+	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
+	ret = notifier_to_errno(ret);
+
+	if (ret) {
+		/*
+		 * err is >= 0 on the first pass (0, or the unit number from
+		 * dev_alloc_name()) and holds the first errno once a rollback
+		 * has been started.  Testing "err != 0" here would mistake a
+		 * positive unit number for a rollback already in progress and
+		 * leave the rejected name in place.
+		 */
+		if (err >= 0) {
+			err = ret;
+			memcpy(dev->name, oldname, IFNAMSIZ);
+			goto rollback;
+		} else {
+			printk(KERN_ERR
+			       "%s: name change rollback failed: %d.\n",
+			       dev->name, ret);
+		}
+	}
+
+	return err;
+}
+
+/**
+ *	dev_set_alias - change ifalias of a device
+ *	@dev: device
+ *	@alias: name up to IFALIASZ
+ *	@len: limit of bytes to copy from info
+ *
+ *	Set ifalias for a device.  A @len of zero frees the current
+ *	alias.  Returns @len on success, -EINVAL if @len >= IFALIASZ and
+ *	-ENOMEM if the buffer cannot be resized; in the -ENOMEM case the
+ *	existing alias is left untouched.
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+	char *new_ifalias;
+
+	ASSERT_RTNL();
+
+	if (len >= IFALIASZ)
+		return -EINVAL;
+
+	if (!len) {
+		/* kfree() accepts NULL, so no guard is needed. */
+		kfree(dev->ifalias);
+		dev->ifalias = NULL;
+		return 0;
+	}
+
+	/*
+	 * Resize through a temporary: assigning krealloc()'s result straight
+	 * to dev->ifalias would overwrite the only pointer to the old buffer
+	 * with NULL on failure, leaking it and losing the alias.
+	 */
+	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
+	if (!new_ifalias)
+		return -ENOMEM;
+	dev->ifalias = new_ifalias;
+
+	strlcpy(dev->ifalias, alias, len + 1);
+	return len;
+}
+
+
+/**
+ * netdev_features_change - device changes features
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed features.
+ */
+void netdev_features_change(struct net_device *dev)
+{
+ /* Broadcast NETDEV_FEAT_CHANGE for @dev; the chain's result is ignored. */
+ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL(netdev_features_change);
+
+/**
+ * netdev_state_change - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+	/* Nothing is announced for an interface that is not up. */
+	if (!(dev->flags & IFF_UP))
+		return;
+
+	call_netdevice_notifiers(NETDEV_CHANGE, dev);
+	rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+}
+
+/*
+ * netdev_bonding_change - send NETDEV_BONDING_FAILOVER for @dev to the
+ * netdevice notifier chain. The chain's return value is ignored.
+ */
+void netdev_bonding_change(struct net_device *dev)
+{
+ call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
+}
+EXPORT_SYMBOL(netdev_bonding_change);
+
+/**
+ * dev_load - load a network module
+ * @net: the applicable net namespace
+ * @name: name of interface
+ *
+ * If a network interface is not present and the process has suitable
+ * privileges this function loads the module. If module loading is not
+ * available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+	struct net_device *existing;
+
+	/* Only the presence of the device matters; no reference is taken. */
+	read_lock(&dev_base_lock);
+	existing = __dev_get_by_name(net, name);
+	read_unlock(&dev_base_lock);
+
+	if (existing)
+		return;
+
+	/* The privilege check is made only when the device is missing. */
+	if (capable(CAP_SYS_MODULE))
+		request_module("%s", name);
+}
+
+/**
+ * dev_open - prepare an interface for use.
+ * @dev: device to open
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ */
+int dev_open(struct net_device *dev)
+{
+	int err = 0;
+
+	ASSERT_RTNL();
+
+	/* Already up: nothing to do. */
+	if (dev->flags & IFF_UP)
+		return 0;
+
+	/* A device that is no longer present cannot be opened. */
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	/* Flag the device as started before calling into the driver. */
+	set_bit(__LINK_STATE_START, &dev->state);
+
+	if (dev->validate_addr)
+		err = dev->validate_addr(dev);
+
+	/* The private open method runs only if validation did not fail. */
+	if (err == 0 && dev->open)
+		err = dev->open(dev);
+
+	if (err != 0) {
+		/* A driver hook failed: take the started flag back. */
+		clear_bit(__LINK_STATE_START, &dev->state);
+		return err;
+	}
+
+	/* Set the flags. */
+	dev->flags |= IFF_UP;
+
+	/* Initialize multicasting status. */
+	dev_set_rx_mode(dev);
+
+	/* Wakeup transmit queue engine. */
+	dev_activate(dev);
+
+	/* ... and announce new interface. */
+	call_netdevice_notifiers(NETDEV_UP, dev);
+
+	return 0;
+}
+
+/**
+ * dev_close - shutdown an interface.
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+int dev_close(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ might_sleep();
+
+ /* Closing an interface that is not up is a no-op. */
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ /*
+ * Tell people we are going down, so that they can
+ * prepare for it while the device is still operating.
+ */
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+
+ clear_bit(__LINK_STATE_START, &dev->state);
+
+ /* Synchronize to scheduled poll. We cannot touch poll list,
+ * it can be even on different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of its
+ * napi_struct instances on this device.
+ */
+ smp_mb__after_clear_bit(); /* Commit netif_running(). */
+
+ dev_deactivate(dev);
+
+ /*
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
+ */
+ if (dev->stop)
+ dev->stop(dev);
+
+ /*
+ * Device is now down.
+ */
+
+ dev->flags &= ~IFF_UP;
+
+ /*
+ * Tell people we are down
+ */
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+
+ return 0;
+}
+
+
+/**
+ * dev_disable_lro - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	/* Both accessors are required to read-modify-write the flags. */
+	if (ops && ops->get_flags && ops->set_flags) {
+		u32 flags = ops->get_flags(dev);
+
+		if (flags & ETH_FLAG_LRO)
+			ops->set_flags(dev, flags & ~ETH_FLAG_LRO);
+	}
+
+	/* Complain if the feature bit is still set, whatever path we took. */
+	WARN_ON(dev->features & NETIF_F_LRO);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+
+static int dev_boot_phase = 1;
+
+/*
+ * Device change register/unregister. These are not inline or static
+ * as we export them to the world.
+ */
+
+/**
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier(struct notifier_block *nb)
+{
+	struct net_device *dev;
+	struct net_device *last;
+	struct net *net;
+	int err;
+
+	rtnl_lock();
+	err = raw_notifier_chain_register(&netdev_chain, nb);
+	if (err)
+		goto unlock;
+	/* During the boot phase nothing is replayed to the new notifier. */
+	if (dev_boot_phase)
+		goto unlock;
+
+	/* Replay REGISTER (and UP, where applicable) for every device. */
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
+			err = notifier_to_errno(err);
+			if (err)
+				goto rollback;
+
+			if (!(dev->flags & IFF_UP))
+				continue;
+
+			nb->notifier_call(nb, NETDEV_UP, dev);
+		}
+	}
+
+unlock:
+	rtnl_unlock();
+	return err;
+
+rollback:
+	/*
+	 * Undo the replay for every device announced before the one whose
+	 * REGISTER call failed.  The walk must stop completely at that
+	 * device: a plain "break" would only leave the inner loop, and the
+	 * devices of all following namespaces would then be sent
+	 * DOWN/UNREGISTER events without ever having been announced.
+	 */
+	last = dev;
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			if (dev == last)
+				goto outroll;
+
+			if (dev->flags & IFF_UP) {
+				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
+				nb->notifier_call(nb, NETDEV_DOWN, dev);
+			}
+			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+		}
+	}
+
+outroll:
+	raw_notifier_chain_unregister(&netdev_chain, nb);
+	goto unlock;
+}
+
+/**
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked from the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ */
+
+int unregister_netdevice_notifier(struct notifier_block *nb)
+{
+ int err;
+
+ /* The chain is modified with RTNL held, as in registration. */
+ rtnl_lock();
+ err = raw_notifier_chain_unregister(&netdev_chain, nb);
+ rtnl_unlock();
+ return err;
+}
+
+/**
+ * call_netdevice_notifiers - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
+{
+ /* Thin wrapper: run netdev_chain with @val as event and @dev as data. */
+ return raw_notifier_call_chain(&netdev_chain, val, dev);
+}
+
+/* When > 0 there are consumers of rx skb time stamps */
+static atomic_t netstamp_needed = ATOMIC_INIT(0);
+
+/* Register one more consumer of rx skb time stamps. */
+void net_enable_timestamp(void)
+{
+ atomic_inc(&netstamp_needed);
+}
+
+/* Drop one consumer of rx skb time stamps. */
+void net_disable_timestamp(void)
+{
+ atomic_dec(&netstamp_needed);
+}
+
+/* Stamp @skb if anyone wants time stamps, otherwise clear its stamp. */
+static inline void net_timestamp(struct sk_buff *skb)
+{
+	if (!atomic_read(&netstamp_needed)) {
+		skb->tstamp.tv64 = 0;
+		return;
+	}
+
+	__net_timestamp(skb);
+}
+
+/*
+ * Support routine. Sends outgoing frames to any network
+ * taps currently in use.
+ */
+
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct packet_type *ptype;
+
+ net_timestamp(skb);
+
+ /* Walk the ptype_all handlers under RCU read-side protection. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ /* Never send packets back to the socket
+ * they originated from - MvS (miquels@drinkel.ow.org)
+ */
+ if ((ptype->dev == dev || !ptype->dev) &&
+ (ptype->af_packet_priv == NULL ||
+ (struct sock *)ptype->af_packet_priv != skb->sk)) {
+ /* Each handler receives its own clone; stop if cloning fails. */
+ struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ break;
+
+ /* The network header should have been set correctly
+ by the sender, so the check below is just
+ protection against buggy protocols.
+ */
+ skb_reset_mac_header(skb2);
+
+ if (skb_network_header(skb2) < skb2->data ||
+ skb2->network_header > skb2->tail) {
+ if (net_ratelimit())
+ printk(KERN_CRIT "protocol %04x is "
+ "buggy, dev %s\n",
+ skb2->protocol, dev->name);
+ skb_reset_network_header(skb2);
+ }
+
+ skb2->transport_header = skb2->network_header;
+ skb2->pkt_type = PACKET_OUTGOING;
+ ptype->func(skb2, skb->dev, ptype, skb->dev);
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ * Push @q onto the head of this CPU's softnet output_queue and raise
+ * NET_TX_SOFTIRQ, all with local interrupts disabled.
+ */
+static inline void __netif_reschedule(struct Qdisc *q)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sd = &__get_cpu_var(softnet_data);
+ /* Link in front of whatever is already queued. */
+ q->next_sched = sd->output_queue;
+ sd->output_queue = q;
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+	/* If the SCHED bit was already set, do not queue the qdisc again. */
+	if (test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+		return;
+
+	__netif_reschedule(q);
+}
+EXPORT_SYMBOL(__netif_schedule);
+
+void dev_kfree_skb_irq(struct sk_buff *skb)
+{
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	/* Only the holder that drops the last user queues the buffer. */
+	if (!atomic_dec_and_test(&skb->users))
+		return;
+
+	/* Chain the skb onto this CPU's completion queue and kick TX softirq. */
+	local_irq_save(flags);
+	sd = &__get_cpu_var(softnet_data);
+	skb->next = sd->completion_queue;
+	sd->completion_queue = skb;
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dev_kfree_skb_irq);
+
+/*
+ * Release @skb from any context: use dev_kfree_skb_irq() when running in
+ * hard-irq context or with interrupts disabled, dev_kfree_skb() otherwise.
+ */
+void dev_kfree_skb_any(struct sk_buff *skb)
+{
+ if (in_irq() || irqs_disabled())
+ dev_kfree_skb_irq(skb);
+ else
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(dev_kfree_skb_any);
+
+
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from system and therefore no longer available.
+ */
+void netif_device_detach(struct net_device *dev)
+{
+	/* Act only on the transition from present to not present. */
+	if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
+		return;
+
+	if (netif_running(dev))
+		netif_stop_queue(dev);
+}
+EXPORT_SYMBOL(netif_device_detach);
+
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached from system and restart if needed.
+ */
+void netif_device_attach(struct net_device *dev)
+{
+	/* Act only on the transition from not present to present. */
+	if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
+		return;
+
+	if (netif_running(dev)) {
+		netif_wake_queue(dev);
+		__netdev_watchdog_up(dev);
+	}
+}
+EXPORT_SYMBOL(netif_device_attach);
+
+/* True if @features contain a checksum offload that covers @protocol. */
+static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+{
+	/* Generic checksumming is accepted for every protocol. */
+	if (features & NETIF_F_GEN_CSUM)
+		return true;
+
+	if ((features & NETIF_F_IP_CSUM) && protocol == htons(ETH_P_IP))
+		return true;
+
+	if ((features & NETIF_F_IPV6_CSUM) && protocol == htons(ETH_P_IPV6))
+		return true;
+
+	return false;
+}
+
+/*
+ * True if @dev can checksum @skb: either directly for skb->protocol, or,
+ * for an 802.1Q frame, for the encapsulated protocol using only the
+ * features that are also present in dev->vlan_features.
+ */
+static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
+{
+	struct vlan_ethhdr *veh;
+
+	if (can_checksum_protocol(dev->features, skb->protocol))
+		return true;
+
+	if (skb->protocol != htons(ETH_P_8021Q))
+		return false;
+
+	veh = (struct vlan_ethhdr *)skb->data;
+	return can_checksum_protocol(dev->features & dev->vlan_features,
+				     veh->h_vlan_encapsulated_proto);
+}
+
+/*
+ * Invalidate hardware checksum when packet is to be mangled, and
+ * complete checksum manually on outgoing path.
+ */
+int skb_checksum_help(struct sk_buff *skb)
+{
+ __wsum csum;
+ int ret = 0, offset;
+
+ /* Nothing to compute for CHECKSUM_COMPLETE; just reset ip_summed. */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ goto out_set_summed;
+
+ if (unlikely(skb_shinfo(skb)->gso_size)) {
+ /* Let GSO fix up the checksum. */
+ goto out_set_summed;
+ }
+
+ /*
+ * Turn csum_start into the offset handed to skb_checksum(); the
+ * start of the checksummed area must lie inside the linear head.
+ */
+ offset = skb->csum_start - skb_headroom(skb);
+ BUG_ON(offset >= skb_headlen(skb));
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ /* Advance to the checksum field, which must also be in the head. */
+ offset += skb->csum_offset;
+ BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+
+ /* A clone that is not writable up to the field is expanded first. */
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(__sum16))) {
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (ret)
+ goto out;
+ }
+
+ /* Store the folded checksum into the packet. */
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+out_set_summed:
+ skb->ip_summed = CHECKSUM_NONE;
+out:
+ return ret;
+}
+
+/**
+ * skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+{
+ /* Result if no handler in the loop below matches skb->protocol. */
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_type *ptype;
+ __be16 type = skb->protocol;
+ int err;
+
+ BUG_ON(skb_shinfo(skb)->frag_list);
+
+ /* Record the MAC header length and pull the data pointer past it. */
+ skb_reset_mac_header(skb);
+ skb->mac_len = skb->network_header - skb->mac_header;
+ __skb_pull(skb, skb->mac_len);
+
+ /* Warn on a non-CHECKSUM_PARTIAL skb; expand a cloned header. */
+ if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ if (skb_header_cloned(skb) &&
+ (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return ERR_PTR(err);
+ }
+
+ /* Use the first device-independent handler of this type with GSO. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype,
+ &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
+ if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ /*
+ * segs becomes ERR_PTR(err): an error pointer on
+ * failure, NULL when err is 0 and skb_gso_ok()
+ * accepts the skb for these features.
+ */
+ err = ptype->gso_send_check(skb);
+ segs = ERR_PTR(err);
+ if (err || skb_gso_ok(skb, features))
+ break;
+ /* Move skb->data back to the network header. */
+ __skb_push(skb, (skb->data -
+ skb_network_header(skb)));
+ }
+ segs = ptype->gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ /* Restore skb->data to the MAC header before returning. */
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+
+EXPORT_SYMBOL(skb_gso_segment);
+
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+/*
+ * Rate-limited report of a hardware checksum failure: logs the device
+ * name ("<unknown>" when @dev is NULL) and dumps the stack.
+ */
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+ if (net_ratelimit()) {
+ printk(KERN_ERR "%s: hw csum failure.\n",
+ dev ? dev->name : "<unknown>");
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
+/* Actually, we should eliminate this check as soon as we know, that:
+ * 1. IOMMU is present and allows to map all the memory.
+ * 2. No high memory really exists on this machine.
+ */
+
+/*
+ * Returns 1 if @skb has a page fragment in high memory while @dev lacks
+ * NETIF_F_HIGHDMA, 0 otherwise. Always 0 without CONFIG_HIGHMEM.
+ */
+static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_HIGHMEM
+ int i;
+
+ if (dev->features & NETIF_F_HIGHDMA)
+ return 0;
+
+ /* One highmem fragment is enough to make the skb illegal. */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ if (PageHighMem(skb_shinfo(skb)->frags[i].page))
+ return 1;
+
+#endif
+ return 0;
+}
+
+/*
+ * State kept in skb->cb while dev_gso_skb_destructor is installed:
+ * the destructor the skb had before it was replaced.
+ */
+struct dev_gso_cb {
+ void (*destructor)(struct sk_buff *skb);
+};
+
+/* View the control buffer of @skb as a struct dev_gso_cb. */
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+/*
+ * Destructor installed by dev_gso_segment(): frees every segment still
+ * chained on skb->next, then calls the saved original destructor, if any.
+ */
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+ struct dev_gso_cb *cb;
+
+ /* Unlink and free one segment per iteration; expects skb->next set. */
+ do {
+ struct sk_buff *nskb = skb->next;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+ kfree_skb(nskb);
+ } while (skb->next);
+
+ cb = DEV_GSO_CB(skb);
+ if (cb->destructor)
+ cb->destructor(skb);
+}
+
+/**
+ * dev_gso_segment - Perform emulated hardware segmentation on skb.
+ * @skb: buffer to segment
+ *
+ * This function segments the given skb and stores the list of segments
+ * in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct sk_buff *segs;
+	int features = dev->features;
+
+	/* Scatter/gather is masked out when illegal_highdma() flags the skb. */
+	if (illegal_highdma(dev, skb))
+		features &= ~NETIF_F_SG;
+
+	segs = skb_gso_segment(skb, features);
+
+	if (IS_ERR(segs))
+		return PTR_ERR(segs);
+
+	/* Verifying header integrity only. */
+	if (!segs)
+		return 0;
+
+	/*
+	 * Hang the segments off the original skb and interpose our
+	 * destructor, remembering the previous one in the control buffer.
+	 */
+	skb->next = segs;
+	DEV_GSO_CB(skb)->destructor = skb->destructor;
+	skb->destructor = dev_gso_skb_destructor;
+
+	return 0;
+}
+
+/*
+ * Hand @skb to the driver's hard_start_xmit(). If @skb needs GSO it is
+ * segmented first and the segments chained on skb->next are sent one by
+ * one. Returns the driver's return code, or NETDEV_TX_BUSY when @txq is
+ * stopped while segments remain; the unsent segments stay on skb->next.
+ */
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ if (likely(!skb->next)) {
+ /* Give the ptype_all handlers a look at the outgoing frame. */
+ if (!list_empty(&ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
+ if (netif_needs_gso(dev, skb)) {
+ if (unlikely(dev_gso_segment(skb)))
+ goto out_kfree_skb;
+ /* Segments were produced: send them via the loop below. */
+ if (skb->next)
+ goto gso;
+ }
+
+ return dev->hard_start_xmit(skb, dev);
+ }
+
+gso:
+ do {
+ /* Detach the next segment from the chain and send it alone. */
+ struct sk_buff *nskb = skb->next;
+ int rc;
+
+ skb->next = nskb->next;
+ nskb->next = NULL;
+ rc = dev->hard_start_xmit(nskb, dev);
+ if (unlikely(rc)) {
+ /* Driver refused it: put the segment back at the head. */
+ nskb->next = skb->next;
+ skb->next = nskb;
+ return rc;
+ }
+ if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
+ return NETDEV_TX_BUSY;
+ } while (skb->next);
+
+ /* All segments sent: restore the saved destructor before freeing. */
+ skb->destructor = DEV_GSO_CB(skb)->destructor;
+
+out_kfree_skb:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* Random seed for simple_tx_hash(), filled in on first use. */
+static u32 simple_tx_hashrnd;
+static int simple_tx_hashrnd_initialized = 0;
+
+/*
+ * Pick a tx queue index for @skb by hashing two address words and one
+ * 32-bit word taken right after the IP header. Returns 0 for anything
+ * that is neither IPv4 nor IPv6.
+ */
+static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ u32 hash, ihl;
+ u8 ip_proto = 0;
+
+ if (unlikely(!simple_tx_hashrnd_initialized)) {
+ get_random_bytes(&simple_tx_hashrnd, 4);
+ simple_tx_hashrnd_initialized = 1;
+ }
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ /* With MF or a fragment offset set, ip_proto stays 0 (ports = 0). */
+ if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+ ip_proto = ip_hdr(skb)->protocol;
+ addr1 = ip_hdr(skb)->saddr;
+ addr2 = ip_hdr(skb)->daddr;
+ ihl = ip_hdr(skb)->ihl;
+ break;
+ case htons(ETH_P_IPV6):
+ ip_proto = ipv6_hdr(skb)->nexthdr;
+ /* Only the last 32 bits of each IPv6 address are hashed. */
+ addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+ addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+ /* 40 bytes in 32-bit words, matching the "ihl * 4" below. */
+ ihl = (40 >> 2);
+ break;
+ default:
+ return 0;
+ }
+
+
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ /* First 32 bits located ihl * 4 bytes into the network header. */
+ ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
+ break;
+
+ default:
+ ports = 0;
+ break;
+ }
+
+ hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
+
+ /* Scale the 32-bit hash into the range [0, real_num_tx_queues). */
+ return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+/*
+ * Choose the tx queue for @skb: the driver's select_queue() hook if it
+ * has one, else simple_tx_hash() when more than one real queue exists,
+ * else queue 0.  The choice is also recorded in the skb.
+ */
+static struct netdev_queue *dev_pick_tx(struct net_device *dev,
+					struct sk_buff *skb)
+{
+	u16 idx;
+
+	if (dev->select_queue)
+		idx = dev->select_queue(dev, skb);
+	else if (dev->real_num_tx_queues > 1)
+		idx = simple_tx_hash(dev, skb);
+	else
+		idx = 0;
+
+	skb_set_queue_mapping(skb, idx);
+	return netdev_get_tx_queue(dev, idx);
+}
+
+/**
+ * dev_queue_xmit - transmit a buffer
+ * @skb: buffer to transmit
+ *
+ * Queue a buffer for transmission to a network device. The caller must
+ * have set the device and priority and built the buffer before calling
+ * this function. The function can be called from an interrupt.
+ *
+ * A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ *
+ * -----------------------------------------------------------------------------------
+ * Note that this function can also return the result of the queue
+ * discipline's enqueue operation, including NET_XMIT_DROP, which is a
+ * positive value. So, errors can also be positive.
+ *
+ * Regardless of the return value, the skb is consumed, so it is currently
+ * difficult to retry a send through this function. (You can bump the ref
+ * count before sending to hold a reference for retry if you are careful.)
+ *
+ * When calling this function, interrupts MUST be enabled. This is because
+ * the BH enable code must have IRQs enabled so that it will not deadlock.
+ * --BLG
+ */
+int dev_queue_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ struct Qdisc *q;
+ int rc = -ENOMEM; /* returned if linearizing or checksumming below fails */
+
+ /* GSO will handle the following emulations directly. */
+ if (netif_needs_gso(dev, skb))
+ goto gso;
+
+ if (skb_shinfo(skb)->frag_list &&
+ !(dev->features & NETIF_F_FRAGLIST) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* Fragmented skb is linearized if device does not support SG,
+ * or if at least one of fragments is in highmem and device
+ * does not support DMA from it.
+ */
+ if (skb_shinfo(skb)->nr_frags &&
+ (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* If packet is not checksummed and device does not support
+ * checksumming for this protocol, complete checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ skb_set_transport_header(skb, skb->csum_start -
+ skb_headroom(skb));
+ if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
+ goto out_kfree_skb;
+ }
+
+gso:
+ /* Disable soft irqs for various locks below. Also
+ * stops preemption for RCU.
+ */
+ rcu_read_lock_bh();
+
+ txq = dev_pick_tx(dev, skb);
+ q = rcu_dereference(txq->qdisc);
+
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
+#endif
+ if (q->enqueue) { /* qdisc has an enqueue op: hand the skb to it under the qdisc lock */
+ spinlock_t *root_lock = qdisc_lock(q);
+
+ spin_lock(root_lock);
+
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ kfree_skb(skb);
+ rc = NET_XMIT_DROP;
+ } else {
+ rc = qdisc_enqueue_root(skb, q);
+ qdisc_run(q);
+ }
+ spin_unlock(root_lock);
+
+ goto out;
+ }
+
+ /* The device has no queue. Common case for software devices:
+ loopback, all the sorts of tunnels...
+
+ Really, it is unlikely that netif_tx_lock protection is necessary
+ here. (f.e. loopback and IP tunnels are clean ignoring statistics
+ counters.)
+ However, it is possible, that they rely on protection
+ made by us here.
+
+ Check this and shot the lock. It is not prone from deadlocks.
+ Either shot noqueue qdisc, it is even simpler 8)
+ */
+ if (dev->flags & IFF_UP) {
+ int cpu = smp_processor_id(); /* ok because BHs are off */
+
+ if (txq->xmit_lock_owner != cpu) { /* this CPU is not already the recorded xmit lock owner */
+
+ HARD_TX_LOCK(dev, txq, cpu);
+
+ if (!netif_tx_queue_stopped(txq)) {
+ rc = 0;
+ if (!dev_hard_start_xmit(skb, dev, txq)) {
+ HARD_TX_UNLOCK(dev, txq);
+ goto out;
+ }
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ if (net_ratelimit())
+ printk(KERN_CRIT "Virtual device %s asks to "
+ "queue packet!\n", dev->name);
+ } else {
+ /* Recursion is detected! It is possible,
+ * unfortunately */
+ if (net_ratelimit())
+ printk(KERN_CRIT "Dead loop on virtual device "
+ "%s, fix it urgently!\n", dev->name);
+ }
+ }
+
+ rc = -ENETDOWN; /* device down, queue stopped, xmit refused or recursion: drop the skb */
+ rcu_read_unlock_bh();
+
+out_kfree_skb:
+ kfree_skb(skb);
+ return rc;
+out:
+ rcu_read_unlock_bh();
+ return rc;
+}
+
+
+/*=======================================================================
+ Receiver routines
+ =======================================================================*/
+
+int netdev_max_backlog __read_mostly = 1000;
+int netdev_budget __read_mostly = 300;
+int weight_p __read_mostly = 64; /* old backlog weight */
+
+DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+
+
+/**
+ *	netif_rx	-	post buffer to the network code
+ *	@skb: buffer to post
+ *
+ *	Called by a device driver to hand a received packet to the
+ *	per-CPU backlog queue, from which the upper (protocol) levels
+ *	process it later. The buffer may still be dropped during
+ *	processing for congestion control or by the protocol layers.
+ *
+ *	return values:
+ *	NET_RX_SUCCESS	(packet was queued)
+ *	NET_RX_DROP	(packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+	struct softnet_data *sd;
+	unsigned long irq_flags;
+
+	/* if netpoll wants it, pretend we never saw it */
+	if (netpoll_rx(skb))
+		return NET_RX_DROP;
+
+	if (!skb->tstamp.tv64)
+		net_timestamp(skb);
+
+	local_irq_save(irq_flags);
+	sd = &__get_cpu_var(softnet_data);
+
+	__get_cpu_var(netdev_rx_stat).total++;
+
+	/* Backlog already over the limit: account the drop and free. */
+	if (sd->input_pkt_queue.qlen > netdev_max_backlog) {
+		__get_cpu_var(netdev_rx_stat).dropped++;
+		local_irq_restore(irq_flags);
+
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	/* An empty queue means the backlog NAPI has to be kicked first. */
+	if (!sd->input_pkt_queue.qlen)
+		napi_schedule(&sd->backlog);
+
+	__skb_queue_tail(&sd->input_pkt_queue, skb);
+	local_irq_restore(irq_flags);
+	return NET_RX_SUCCESS;
+}
+
+/*
+ * Variant of netif_rx() that, with preemption disabled, also runs any
+ * softirqs left pending after the packet has been queued. Returns
+ * whatever netif_rx() returned.
+ */
+int netif_rx_ni(struct sk_buff *skb)
+{
+	int ret;
+
+	preempt_disable();
+
+	ret = netif_rx(skb);
+	if (local_softirq_pending())
+		do_softirq();
+
+	preempt_enable();
+
+	return ret;
+}
+
+EXPORT_SYMBOL(netif_rx_ni);
+
+/*
+ * TX softirq handler: free the skbs parked on this CPU's completion
+ * queue, then run (or reschedule) every qdisc on this CPU's output queue.
+ */
+static void net_tx_action(struct softirq_action *h)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+
+	if (sd->completion_queue) {
+		struct sk_buff *pending;
+
+		/* Detach the whole list with interrupts off. */
+		local_irq_disable();
+		pending = sd->completion_queue;
+		sd->completion_queue = NULL;
+		local_irq_enable();
+
+		while (pending) {
+			struct sk_buff *skb = pending;
+
+			pending = skb->next;
+
+			WARN_ON(atomic_read(&skb->users));
+			__kfree_skb(skb);
+		}
+	}
+
+	if (sd->output_queue) {
+		struct Qdisc *next;
+
+		/* Detach the whole list with interrupts off. */
+		local_irq_disable();
+		next = sd->output_queue;
+		sd->output_queue = NULL;
+		local_irq_enable();
+
+		while (next) {
+			struct Qdisc *q = next;
+			spinlock_t *root_lock;
+
+			next = q->next_sched;
+
+			root_lock = qdisc_lock(q);
+			if (spin_trylock(root_lock)) {
+				smp_mb__before_clear_bit();
+				clear_bit(__QDISC_STATE_SCHED, &q->state);
+				qdisc_run(q);
+				spin_unlock(root_lock);
+				continue;
+			}
+
+			/* Lock is busy: retry later unless the qdisc is deactivated. */
+			if (!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)) {
+				__netif_reschedule(q);
+				continue;
+			}
+
+			smp_mb__before_clear_bit();
+			clear_bit(__QDISC_STATE_SCHED, &q->state);
+		}
+	}
+}
+
+/*
+ * Take an extra reference on @skb and pass it to @pt_prev's handler.
+ * Returns the handler's return value.
+ */
+static inline int deliver_skb(struct sk_buff *skb,
+			      struct packet_type *pt_prev,
+			      struct net_device *orig_dev)
+{
+	struct net_device *dev = skb->dev;
+
+	atomic_inc(&skb->users);
+
+	return pt_prev->func(skb, dev, pt_prev, orig_dev);
+}
+
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+/* These hooks defined here for ATM */
+struct net_bridge;
+struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
+ unsigned char *addr);
+void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
+
+/*
+ * If bridge module is loaded call bridging hook.
+ * returns NULL if packet was consumed.
+ */
+struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
+ struct sk_buff *skb) __read_mostly;
+/*
+ * Pass @skb to the bridge hook when its device is a bridge port.
+ * Loopback packets and packets on non-port devices are returned
+ * untouched. A pending handler in *@pt_prev is delivered first and
+ * then cleared.
+ */
+static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
+					    struct packet_type **pt_prev, int *ret,
+					    struct net_device *orig_dev)
+{
+	struct net_bridge_port *port;
+
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		return skb;
+
+	port = rcu_dereference(skb->dev->br_port);
+	if (port == NULL)
+		return skb;
+
+	if (*pt_prev) {
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+
+	return br_handle_frame_hook(port, skb);
+}
+#else
+#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
+#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
+struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
+
+/*
+ * Pass @skb to the macvlan hook when its device has a macvlan port;
+ * otherwise return it untouched. A pending handler in *@pt_prev is
+ * delivered first and then cleared.
+ */
+static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
+					     struct packet_type **pt_prev,
+					     int *ret,
+					     struct net_device *orig_dev)
+{
+	if (!skb->dev->macvlan_port)
+		return skb;
+
+	if (*pt_prev) {
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+
+	return macvlan_handle_frame_hook(skb);
+}
+#else
+#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
+#ifdef CONFIG_NET_CLS_ACT
+/* TODO: Maybe we should just force sch_ingress to be compiled in
+ * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
+ * instructions (a compare and 2 extra stores right now) if we don't
+ * have it on but do have CONFIG_NET_CLS_ACT.
+ * NOTE: This doesn't stop any functionality; if you don't have
+ * the ingress scheduler, you just can't add policies on ingress.
+ *
+ */
+/*
+ * Run @skb through the ingress qdisc of its device, if one is attached.
+ * Returns a TC_ACT_* verdict: TC_ACT_SHOT when the redirect loop limit
+ * is exceeded, TC_ACT_OK when nothing was enqueued, otherwise the result
+ * of the enqueue.
+ */
+static int ing_filter(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	u32 ttl = G_TC_RTTL(skb->tc_verd);
+	int verdict = TC_ACT_OK;
+	spinlock_t *lock;
+	struct Qdisc *q;
+
+	if (ttl++ > MAX_RED_LOOP) {
+		printk(KERN_WARNING
+		       "Redir loop detected Dropping packet (%d->%d)\n",
+		       skb->iif, dev->ifindex);
+		return TC_ACT_SHOT;
+	}
+
+	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
+	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+
+	q = dev->rx_queue.qdisc;
+	if (q == &noop_qdisc)
+		return verdict;
+
+	lock = qdisc_lock(q);
+	spin_lock(lock);
+	if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
+		verdict = qdisc_enqueue_root(skb, q);
+	spin_unlock(lock);
+
+	return verdict;
+}
+
+/*
+ * Apply ingress filtering to @skb when its device has an ingress qdisc.
+ * Returns NULL (after freeing the skb) when the filter verdict is
+ * TC_ACT_SHOT or TC_ACT_STOLEN; otherwise returns @skb with tc_verd
+ * cleared.
+ */
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+					 struct packet_type **pt_prev,
+					 int *ret, struct net_device *orig_dev)
+{
+	if (skb->dev->rx_queue.qdisc != &noop_qdisc) {
+		int verdict;
+
+		if (*pt_prev) {
+			*ret = deliver_skb(skb, *pt_prev, orig_dev);
+			*pt_prev = NULL;
+		} else {
+			/* Huh? Why does turning on AF_PACKET affect this? */
+			skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
+		}
+
+		verdict = ing_filter(skb);
+		if (verdict == TC_ACT_SHOT || verdict == TC_ACT_STOLEN) {
+			kfree_skb(skb);
+			return NULL;
+		}
+	}
+
+	skb->tc_verd = 0;
+	return skb;
+}
+#endif
+
+/*
+ *	netif_nit_deliver - deliver received packets to network taps
+ *	@skb: buffer
+ *
+ *	Hands an incoming packet to every handler on the ptype_all list
+ *	that is either not bound to a device or bound to the packet's
+ *	device. It should be used when the normal netif_receive_skb path
+ *	is bypassed, for example because of VLAN acceleration.
+ */
+void netif_nit_deliver(struct sk_buff *skb)
+{
+	struct packet_type *tap;
+
+	if (list_empty(&ptype_all))
+		return;
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb->mac_len = skb->network_header - skb->mac_header;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tap, &ptype_all, list) {
+		/* Skip taps bound to some other device. */
+		if (tap->dev && tap->dev != skb->dev)
+			continue;
+		deliver_skb(skb, tap, skb->dev);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It hands the buffer to every matching handler on the ptype_all list
+ * and on the ptype_base hash chain for the buffer's protocol, after
+ * giving the ingress, bridge and macvlan hooks a chance to consume it.
+ * The buffer may be dropped during processing for congestion control
+ * or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+ struct packet_type *ptype, *pt_prev;
+ struct net_device *orig_dev;
+ struct net_device *null_or_orig;
+ int ret = NET_RX_DROP;
+ __be16 type;
+
+ if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
+ return NET_RX_SUCCESS;
+
+ /* if we've gotten here through NAPI, check netpoll */
+ if (netpoll_receive_skb(skb))
+ return NET_RX_DROP;
+
+ if (!skb->tstamp.tv64)
+ net_timestamp(skb);
+
+ if (!skb->iif)
+ skb->iif = skb->dev->ifindex;
+
+ null_or_orig = NULL; /* NULL lets handlers without a bound device match below */
+ orig_dev = skb->dev;
+ if (orig_dev->master) {
+ if (skb_bond_should_drop(skb))
+ null_or_orig = orig_dev; /* deliver only exact match */
+ else
+ skb->dev = orig_dev->master;
+ }
+
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->mac_len = skb->network_header - skb->mac_header;
+
+ pt_prev = NULL; /* delivery is deferred by one handler so the last one can be called directly */
+
+ rcu_read_lock();
+
+ /* Don't receive packets in an exiting network namespace */
+ if (!net_alive(dev_net(skb->dev)))
+ goto out;
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (skb->tc_verd & TC_NCLS) {
+ skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
+ goto ncls;
+ }
+#endif
+
+ list_for_each_entry_rcu(ptype, &ptype_all, list) { /* handlers registered for all protocols */
+ if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
+ ptype->dev == orig_dev) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+ }
+
+#ifdef CONFIG_NET_CLS_ACT
+ skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;
+ncls:
+#endif
+
+ skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;
+ skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;
+
+ type = skb->protocol;
+ list_for_each_entry_rcu(ptype,
+ &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
+ if (ptype->type == type &&
+ (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
+ ptype->dev == orig_dev)) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+ }
+
+ if (pt_prev) { /* final handler is called directly, without the extra reference deliver_skb() takes */
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ } else {
+ kfree_skb(skb);
+ /* Jamal, now you will not be able to escape explaining
+ * to me how you were going to use this. :-)
+ */
+ ret = NET_RX_DROP;
+ }
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(void *arg)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct net_device *going_away = arg;
+	struct sk_buff *skb, *next;
+
+	/* Unlink and free every queued skb that belongs to the device. */
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, next) {
+		if (skb->dev != going_away)
+			continue;
+
+		__skb_unlink(skb, &sd->input_pkt_queue);
+		kfree_skb(skb);
+	}
+}
+
+/*
+ * NAPI poll routine for the per-CPU backlog queue. Dequeues skbs and
+ * feeds them to netif_receive_skb() until the queue is empty (in which
+ * case the NAPI instance is completed), @quota packets have been
+ * handled, or the jiffies counter has advanced. Returns the number of
+ * packets processed.
+ */
+static int process_backlog(struct napi_struct *napi, int quota)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	unsigned long start_time = jiffies;
+	int done = 0;
+
+	napi->weight = weight_p;
+	do {
+		struct sk_buff *skb;
+
+		local_irq_disable();
+		skb = __skb_dequeue(&sd->input_pkt_queue);
+		if (!skb)
+			__napi_complete(napi);
+		local_irq_enable();
+
+		if (!skb)
+			break;
+
+		netif_receive_skb(skb);
+	} while (++done < quota && jiffies == start_time);
+
+	return done;
+}
+
+/**
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
+ *
+ * Appends the entry to this CPU's poll list and raises the RX softirq,
+ * so that the entry's receive function will be run.
+ */
+void __napi_schedule(struct napi_struct *n)
+{
+	struct list_head *cpu_poll_list;
+	unsigned long irq_flags;
+
+	local_irq_save(irq_flags);
+	cpu_poll_list = &__get_cpu_var(softnet_data).poll_list;
+	list_add_tail(&n->poll_list, cpu_poll_list);
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	local_irq_restore(irq_flags);
+}
+EXPORT_SYMBOL(__napi_schedule);
+
+
+static void net_rx_action(struct softirq_action *h)
+{
+ struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
+ unsigned long start_time = jiffies;
+ int budget = netdev_budget; /* total work allowed in this run, shared by all polled entries */
+ void *have;
+
+ local_irq_disable();
+
+ while (!list_empty(list)) {
+ struct napi_struct *n;
+ int work, weight;
+
+ /* If softirq window is exhausted then punt.
+ *
+ * Note that this is a slight policy change from the
+ * previous NAPI code, which would allow up to 2
+ * jiffies to pass before breaking out. The test
+ * used to be "jiffies - start_time > 1".
+ */
+ if (unlikely(budget <= 0 || jiffies != start_time))
+ goto softnet_break;
+
+ local_irq_enable();
+
+ /* Even though interrupts have been re-enabled, this
+ * access is safe because interrupts can only add new
+ * entries to the tail of this list, and only ->poll()
+ * calls can remove this head entry from the list.
+ */
+ n = list_entry(list->next, struct napi_struct, poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state))
+ work = n->poll(n, weight);
+
+ WARN_ON_ONCE(work > weight);
+
+ budget -= work;
+
+ local_irq_disable();
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(work == weight)) {
+ if (unlikely(napi_disable_pending(n)))
+ __napi_complete(n);
+ else
+ list_move_tail(&n->poll_list, list);
+ }
+
+ netpoll_poll_unlock(have);
+ }
+out:
+ local_irq_enable();
+
+#ifdef CONFIG_NET_DMA
+ /*
+ * There may not be any more sk_buffs coming right now, so push
+ * any pending DMA copies to hardware
+ */
+ if (!cpus_empty(net_dma.channel_mask)) {
+ int chan_idx;
+ for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
+ struct dma_chan *chan = net_dma.channels[chan_idx];
+ if (chan)
+ dma_async_memcpy_issue_pending(chan);
+ }
+ }
+#endif
+
+ return;
+
+softnet_break:
+ __get_cpu_var(netdev_rx_stat).time_squeeze++; /* count runs cut short by budget or time */
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ); /* re-raise so the remaining entries get polled later */
+ goto out;
+}
+
+static gifconf_func_t * gifconf_list [NPROTO];
+
+/**
+ *	register_gifconf	-	register a SIOCGIF handler
+ *	@family: Address family
+ *	@gifconf: Function handler
+ *
+ *	Install the protocol dependent address dumping routine for
+ *	@family, replacing any previous one. The handler that is passed
+ *	must not be freed or reused until it has been replaced by another
+ *	handler. Returns 0, or -EINVAL when @family is out of range.
+ */
+int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
+{
+	if (family < NPROTO) {
+		gifconf_list[family] = gifconf;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct net *net, struct ifreq __user *arg)
+{
+	struct net_device *found;
+	struct ifreq req;
+
+	/* Fetch the caller's info block. */
+	if (copy_from_user(&req, arg, sizeof(struct ifreq)))
+		return -EFAULT;
+
+	/* Look the index up and copy the name out under dev_base_lock. */
+	read_lock(&dev_base_lock);
+	found = __dev_get_by_index(net, req.ifr_ifindex);
+	if (found)
+		strcpy(req.ifr_name, found->name);
+	read_unlock(&dev_base_lock);
+
+	if (!found)
+		return -ENODEV;
+
+	if (copy_to_user(arg, &req, sizeof(struct ifreq)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+
+static int dev_ifconf(struct net *net, char __user *arg)
+{
+	struct net_device *dev;
+	struct ifconf conf;
+	char __user *buf;
+	int space;
+	int used = 0;
+	int family;
+
+	/* Fetch the caller's info block. */
+	if (copy_from_user(&conf, arg, sizeof(struct ifconf)))
+		return -EFAULT;
+
+	buf = conf.ifc_buf;
+	space = conf.ifc_len;
+
+	/*
+	 * Let every registered handler write its info blocks for every
+	 * interface. Without a user buffer the handlers are called with
+	 * a NULL buffer and zero length.
+	 */
+	for_each_netdev(net, dev) {
+		for (family = 0; family < NPROTO; family++) {
+			int done;
+
+			if (!gifconf_list[family])
+				continue;
+
+			if (buf)
+				done = gifconf_list[family](dev, buf + used,
+							    space - used);
+			else
+				done = gifconf_list[family](dev, NULL, 0);
+
+			if (done < 0)
+				return -EFAULT;
+			used += done;
+		}
+	}
+
+	/* All done. Write the updated control block back to the caller. */
+	conf.ifc_len = used;
+
+	/* Both BSD and Solaris return 0 here, so we do too. */
+	if (copy_to_user(arg, &conf, sizeof(struct ifconf)))
+		return -EFAULT;
+
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * This is invoked by the /proc filesystem handler to display a device
+ * in detail.
+ */
+void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(dev_base_lock)
+{
+	struct net *net = seq_file_net(seq);
+	struct net_device *dev;
+	loff_t idx = 1;
+
+	/* The lock is taken on every path, including the NULL return. */
+	read_lock(&dev_base_lock);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	/* Position 0 is the header token, so devices start at position 1. */
+	for_each_netdev(net, dev) {
+		if (idx == *pos)
+			return dev;
+		idx++;
+	}
+
+	return NULL;
+}
+
+void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+
+	++*pos;
+
+	/* After the header token comes the first device of the namespace. */
+	if (v == SEQ_START_TOKEN)
+		return first_net_device(net);
+
+	return next_net_device((struct net_device *)v);
+}
+
+void dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(dev_base_lock)
+{
+ read_unlock(&dev_base_lock);
+}
+
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+ struct net_device_stats *stats = dev->get_stats(dev);
+
+ seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
+ "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
+ dev->name, stats->rx_bytes, stats->rx_packets,
+ stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors, /* receive "drop" column */
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors + /* receive "frame" column: sum of four counters */
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes, stats->tx_packets,
+ stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors + /* transmit "carrier" column: sum of four counters */
+ stats->tx_aborted_errors +
+ stats->tx_window_errors +
+ stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized
+ * /proc/net interface to create /proc/net/dev
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+	/* Anything but the start token is a device: print its counters. */
+	if (v != SEQ_START_TOKEN) {
+		dev_seq_printf_stats(seq, v);
+		return 0;
+	}
+
+	seq_puts(seq, "Inter-| Receive "
+		      " | Transmit\n"
+		      " face |bytes packets errs drop fifo frame "
+		      "compressed multicast|bytes packets errs "
+		      "drop fifo colls carrier compressed\n");
+	return 0;
+}
+
+/*
+ * Advance *@pos to the next online CPU at or after it and return that
+ * CPU's receive statistics, or NULL once *@pos reaches nr_cpu_ids.
+ */
+static struct netif_rx_stats *softnet_get_online(loff_t *pos)
+{
+	for (; *pos < nr_cpu_ids; ++*pos) {
+		if (cpu_online(*pos))
+			return &per_cpu(netdev_rx_stat, *pos);
+	}
+
+	return NULL;
+}
+
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return softnet_get_online(pos);
+}
+
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * Print one CPU's receive statistics as nine hexadecimal fields. The
+ * five fields between time_squeeze and cpu_collision are always zero.
+ */
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+	struct netif_rx_stats *stats = v;
+
+	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+		   stats->total, stats->dropped, stats->time_squeeze,
+		   0, 0, 0, 0, 0, /* was fastroute */
+		   stats->cpu_collision);
+
+	return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_seq_show,
+};
+
+static int dev_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static const struct seq_operations softnet_seq_ops = {
+ .start = softnet_seq_start,
+ .next = softnet_seq_next,
+ .stop = softnet_seq_stop,
+ .show = softnet_seq_show,
+};
+
+static int softnet_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &softnet_seq_ops);
+}
+
+static const struct file_operations softnet_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = softnet_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Return the @pos'th packet handler, counting through ptype_all first
+ * and then through each ptype_base hash chain in order, or NULL when
+ * there are not that many handlers.
+ */
+static void *ptype_get_idx(loff_t pos)
+{
+	struct packet_type *pt = NULL;
+	loff_t seen = 0;
+	int bucket;
+
+	list_for_each_entry_rcu(pt, &ptype_all, list) {
+		if (seen++ == pos)
+			return pt;
+	}
+
+	for (bucket = 0; bucket < PTYPE_HASH_SIZE; bucket++) {
+		list_for_each_entry_rcu(pt, &ptype_base[bucket], list) {
+			if (seen++ == pos)
+				return pt;
+		}
+	}
+
+	return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+
+	/* Position 0 is the header line; handlers start at position 1. */
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return ptype_get_idx(*pos - 1);
+}
+
+/*
+ * Step to the handler after @v: first along the list @v is on, then on
+ * to the following non-empty ptype_base chains. Returns NULL after the
+ * last chain.
+ */
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct packet_type *cur;
+	struct list_head *node;
+	int bucket;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ptype_get_idx(0);
+
+	cur = v;
+	node = cur->list.next;
+	if (cur->type == htons(ETH_P_ALL)) {
+		/* Still inside ptype_all? */
+		if (node != &ptype_all)
+			return list_entry(node, struct packet_type, list);
+
+		/* ptype_all is exhausted: continue with the first chain. */
+		bucket = 0;
+		node = ptype_base[0].next;
+	} else {
+		bucket = ntohs(cur->type) & PTYPE_HASH_MASK;
+	}
+
+	/* Skip over chain heads, i.e. exhausted or empty chains. */
+	while (node == &ptype_base[bucket]) {
+		if (++bucket >= PTYPE_HASH_SIZE)
+			return NULL;
+		node = ptype_base[bucket].next;
+	}
+
+	return list_entry(node, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * Print the address @sym symbolically as ":module:symbol+0xoffset"
+ * (or "symbol+0xoffset" when no module name is reported) if
+ * CONFIG_KALLSYMS is enabled and the lookup succeeds, otherwise as a
+ * bracketed pointer.
+ */
+static void ptype_seq_decode(struct seq_file *seq, void *sym)
+{
+#ifdef CONFIG_KALLSYMS
+	unsigned long off = 0, size;
+	const char *name;
+	char *module;
+	char buf[128];
+
+	name = kallsyms_lookup((unsigned long)sym, &size, &off,
+			       &module, buf);
+
+	if (name) {
+		char *sep = ":";
+
+		if (!module) {
+			sep = "";
+			module = "";
+		}
+		seq_printf(seq, "%s%s%s%s+0x%lx", sep, module, sep,
+			   name, off);
+		return;
+	}
+#endif
+
+	seq_printf(seq, "[%p]", sym);
+}
+
+/*
+ * Print the header line for the start token, or one line per handler.
+ * Handlers bound to a device in another network namespace are skipped.
+ */
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+	struct packet_type *pt = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Type Device Function\n");
+		return 0;
+	}
+
+	if (pt->dev != NULL && dev_net(pt->dev) != seq_file_net(seq))
+		return 0;
+
+	if (pt->type == htons(ETH_P_ALL))
+		seq_puts(seq, "ALL ");
+	else
+		seq_printf(seq, "%04x", ntohs(pt->type));
+
+	seq_printf(seq, " %-8s ",
+		   pt->dev ? pt->dev->name : "");
+	ptype_seq_decode(seq, pt->func);
+	seq_putc(seq, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+ .start = ptype_seq_start,
+ .next = ptype_seq_next,
+ .stop = ptype_seq_stop,
+ .show = ptype_seq_show,
+};
+
+static int ptype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ptype_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ptype_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ptype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+/*
+ * Create the per-namespace proc entries "dev", "softnet_stat" and
+ * "ptype" and initialise the wext proc entries. On any failure the
+ * entries created so far are removed again and -ENOMEM is returned.
+ */
+static int __net_init dev_proc_net_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
+		return -ENOMEM;
+	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO,
+				  &softnet_seq_fops))
+		goto out_dev;
+	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
+		goto out_softnet;
+	if (wext_proc_init(net))
+		goto out_ptype;
+
+	return 0;
+
+out_ptype:
+	proc_net_remove(net, "ptype");
+out_softnet:
+	proc_net_remove(net, "softnet_stat");
+out_dev:
+	proc_net_remove(net, "dev");
+	return -ENOMEM;
+}
+
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+ wext_proc_exit(net);
+
+ proc_net_remove(net, "ptype");
+ proc_net_remove(net, "softnet_stat");
+ proc_net_remove(net, "dev");
+}
+
+static struct pernet_operations __net_initdata dev_proc_ops = {
+ .init = dev_proc_net_init,
+ .exit = dev_proc_net_exit,
+};
+
+static int __init dev_proc_init(void)
+{
+ return register_pernet_subsys(&dev_proc_ops);
+}
+#else
+#define dev_proc_init() 0
+#endif /* CONFIG_PROC_FS */
+
+
+/**
+ *	netdev_set_master	-	set up master/slave pair
+ *	@slave: slave device
+ *	@master: new master device
+ *
+ *	Changes the master device of the slave. Pass %NULL to break the
+ *	bonding. The caller must hold the RTNL semaphore. Returns -EBUSY
+ *	when a master is passed but the slave already has one. On success
+ *	the reference counts are adjusted, the %IFF_SLAVE flag is updated,
+ *	%RTM_NEWLINK is sent to the routing socket and the function
+ *	returns zero.
+ */
+int netdev_set_master(struct net_device *slave, struct net_device *master)
+{
+	struct net_device *prev = slave->master;
+
+	ASSERT_RTNL();
+
+	/* An existing master must be removed before a new one is set. */
+	if (master && prev)
+		return -EBUSY;
+
+	if (master)
+		dev_hold(master);
+
+	slave->master = master;
+
+	synchronize_net();
+
+	if (prev)
+		dev_put(prev);
+
+	if (master)
+		slave->flags |= IFF_SLAVE;
+	else
+		slave->flags &= ~IFF_SLAVE;
+
+	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
+	return 0;
+}
+
+/*
+ * Tell the driver, through its change_rx_flags hook, which RX flags
+ * changed. Does nothing when the device is down or has no such hook.
+ */
+static void dev_change_rx_flags(struct net_device *dev, int flags)
+{
+	if (!(dev->flags & IFF_UP))
+		return;
+
+	if (dev->change_rx_flags)
+		dev->change_rx_flags(dev, flags);
+}
+
+static int __dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ unsigned short old_flags = dev->flags; /* snapshot used to detect a promiscuous mode transition */
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_PROMISC;
+ dev->promiscuity += inc;
+ if (dev->promiscuity == 0) {
+ /*
+ * A count of zero after the update means either that the
+ * last user went away (inc < 0), in which case the flag is
+ * cleared, or that the counter wrapped around. In the
+ * latter case undo the addition and return an error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_PROMISC;
+ else {
+ dev->promiscuity -= inc;
+ printk(KERN_WARNING "%s: promiscuity touches roof, "
+ "set promiscuity failed, promiscuity feature "
+ "of device might be broken.\n", dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags != old_flags) { /* log, audit and notify the driver only on an actual transition */
+ printk(KERN_INFO "device %s %s promiscuous mode\n",
+ dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
+ "left");
+ if (audit_enabled)
+ audit_log(current->audit_context, GFP_ATOMIC,
+ AUDIT_ANOM_PROMISCUOUS,
+ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
+ dev->name, (dev->flags & IFF_PROMISC),
+ (old_flags & IFF_PROMISC),
+ audit_get_loginuid(current),
+ current->uid, current->gid,
+ audit_get_sessionid(current));
+
+ dev_change_rx_flags(dev, IFF_PROMISC);
+ }
+ return 0;
+}
+
+/**
+ *	dev_set_promiscuity	- update promiscuity count on a device
+ *	@dev: device
+ *	@inc: modifier
+ *
+ *	Add or remove promiscuity from a device. While the count in the
+ *	device remains above zero the interface remains promiscuous. Once
+ *	it hits zero the device reverts back to normal filtering
+ *	operation. A negative @inc value is used to drop promiscuity on
+ *	the device. When the device flags change as a result, the RX mode
+ *	is reprogrammed.
+ *	Return 0 if successful or a negative errno code on error.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+	unsigned short flags_before = dev->flags;
+	int err = __dev_set_promiscuity(dev, inc);
+
+	if (err >= 0 && dev->flags != flags_before)
+		dev_set_rx_mode(dev);
+
+	return err;
+}
+
+/**
+ * dev_set_allmulti - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all multicast frames. Once it hits zero the device reverts back to
+ * normal filtering operation. A negative @inc value is used to drop the
+ * counter when releasing a resource needing all multicasts.
+ * Return 0 if successful or a negative errno code on error.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ unsigned short old_flags = dev->flags; /* snapshot used to detect an allmulti transition */
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_ALLMULTI;
+ dev->allmulti += inc;
+ if (dev->allmulti == 0) {
+ /*
+ * A count of zero after the update means either that the
+ * last user went away (inc < 0), in which case the flag is
+ * cleared, or that the counter wrapped around. In the
+ * latter case undo the addition and return an error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_ALLMULTI;
+ else {
+ dev->allmulti -= inc;
+ printk(KERN_WARNING "%s: allmulti touches roof, "
+ "set allmulti failed, allmulti feature of "
+ "device might be broken.\n", dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags ^ old_flags) { /* flags differ: notify the driver and reprogram the RX filter */
+ dev_change_rx_flags(dev, IFF_ALLMULTI);
+ dev_set_rx_mode(dev);
+ }
+ return 0;
+}
+
+/*
+ * Upload unicast and multicast address lists to device and
+ * configure RX filtering. When the device doesn't support unicast
+ * filtering (it has no set_rx_mode hook) it is put in promiscuous
+ * mode while unicast addresses are present. Nothing is done while
+ * the device is down or not present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (dev->set_rx_mode)
+ dev->set_rx_mode(dev);
+ else {
+ /* Unicast addresses changes may only happen under the rtnl,
+ * therefore calling __dev_set_promiscuity here is safe.
+ *
+ * NOTE(review): the return value of __dev_set_promiscuity()
+ * is ignored, so uc_promisc is updated even if the call
+ * reports -EOVERFLOW - confirm whether that is intended.
+ */
+ if (dev->uc_count > 0 && !dev->uc_promisc) {
+ __dev_set_promiscuity(dev, 1);
+ dev->uc_promisc = 1;
+ } else if (dev->uc_count == 0 && dev->uc_promisc) {
+ __dev_set_promiscuity(dev, -1);
+ dev->uc_promisc = 0;
+ }
+
+ if (dev->set_multicast_list)
+ dev->set_multicast_list(dev);
+ }
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{ /* Locked wrapper: runs __dev_set_rx_mode() under netif_addr_lock_bh(dev). */
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+int __dev_addr_delete(struct dev_addr_list **list, int *count,
+ void *addr, int alen, int glbl)
+{ /* Drop one reference to @addr in *@list; unlink and free the entry at zero users. */
+ struct dev_addr_list *da;
+
+ for (; (da = *list) != NULL; list = &da->next) { /* @list tracks the link that points at da */
+ if (alen == da->da_addrlen && /* length first: never read past @alen bytes of @addr */
+ memcmp(da->da_addr, addr, alen) == 0) {
+ if (glbl) {
+ int old_glbl = da->da_gusers;
+ da->da_gusers = 0;
+ if (old_glbl == 0) /* no global reference was held: falls out to -ENOENT */
+ break;
+ }
+ if (--da->da_users) /* other users remain; keep the entry */
+ return 0;
+
+ *list = da->next; /* unlink the entry before freeing it */
+ kfree(da);
+ (*count)--;
+ return 0;
+ }
+ }
+ return -ENOENT; /* no matching entry, or global reference not held */
+}
+
+int __dev_addr_add(struct dev_addr_list **list, int *count,
+ void *addr, int alen, int glbl)
+{ /* Take a reference to @addr in *@list, allocating a new head entry if it is absent. */
+ struct dev_addr_list *da;
+
+ for (da = *list; da != NULL; da = da->next) {
+ if (da->da_addrlen == alen && /* length first: never read past @alen bytes of @addr */
+ memcmp(da->da_addr, addr, alen) == 0) {
+ if (glbl) {
+ int old_glbl = da->da_gusers;
+ da->da_gusers = 1;
+ if (old_glbl) /* global reference already counted */
+ return 0;
+ }
+ da->da_users++;
+ return 0;
+ }
+ }
+
+ da = kzalloc(sizeof(*da), GFP_ATOMIC); /* zeroed, so da_synced starts at 0 */
+ if (da == NULL)
+ return -ENOMEM;
+ memcpy(da->da_addr, addr, alen); /* NOTE(review): assumes alen fits da->da_addr - confirm callers */
+ da->da_addrlen = alen;
+ da->da_users = 1;
+ da->da_gusers = glbl ? 1 : 0;
+ da->next = *list; /* insert at the head of the list */
+ *list = da;
+ (*count)++;
+ return 0;
+}
+
+/**
+ * dev_unicast_delete - Release secondary unicast address.
+ * @dev: device owning the unicast list
+ * @addr: address to delete
+ * @alen: length of @addr
+ *
+ * Release reference to a secondary unicast address and remove it
+ * from the device if the reference count drops to zero.
+ *
+ * Caller must hold the rtnl_mutex. Returns __dev_addr_delete()'s result.
+ */
+int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ netif_addr_lock_bh(dev);
+ err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
+ if (!err) /* on success, refresh the device's RX mode */
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_unicast_delete);
+
+/**
+ * dev_unicast_add - add a secondary unicast address
+ * @dev: device owning the unicast list
+ * @addr: address to add
+ * @alen: length of @addr
+ *
+ * Add a secondary unicast address to the device or increase
+ * the reference count if it already exists.
+ *
+ * Caller must hold the rtnl_mutex. Returns __dev_addr_add()'s result.
+ */
+int dev_unicast_add(struct net_device *dev, void *addr, int alen)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ netif_addr_lock_bh(dev);
+ err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
+ if (!err) /* on success, refresh the device's RX mode */
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_unicast_add);
+
+int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
+ struct dev_addr_list **from, int *from_count)
+{ /* Add not-yet-synced entries of *@from to *@to; drop synced entries with a single user. */
+ struct dev_addr_list *da, *next;
+ int err = 0;
+
+ da = *from;
+ while (da != NULL) {
+ next = da->next; /* saved first: da may be freed by the delete below */
+ if (!da->da_synced) {
+ err = __dev_addr_add(to, to_count,
+ da->da_addr, da->da_addrlen, 0);
+ if (err < 0) /* stop at the first failure; earlier entries stay synced */
+ break;
+ da->da_synced = 1;
+ da->da_users++; /* extra user accounted for the synced state */
+ } else if (da->da_users == 1) { /* synced and only one user left */
+ __dev_addr_delete(to, to_count,
+ da->da_addr, da->da_addrlen, 0);
+ __dev_addr_delete(from, from_count,
+ da->da_addr, da->da_addrlen, 0);
+ }
+ da = next;
+ }
+ return err; /* 0, or the negative result of the failing __dev_addr_add() */
+}
+
+void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
+ struct dev_addr_list **from, int *from_count)
+{ /* For each synced entry of *@from, drop one reference in *@to and one in *@from. */
+ struct dev_addr_list *da, *next;
+
+ da = *from;
+ while (da != NULL) {
+ next = da->next; /* saved first: da may be freed by the delete below */
+ if (da->da_synced) {
+ __dev_addr_delete(to, to_count,
+ da->da_addr, da->da_addrlen, 0);
+ da->da_synced = 0; /* cleared before the delete that may free da */
+ __dev_addr_delete(from, from_count,
+ da->da_addr, da->da_addrlen, 0);
+ }
+ da = next;
+ }
+}
+
+/**
+ * dev_unicast_sync - Synchronize device's unicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh (only @to is locked here).
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices.
+ */
+int dev_unicast_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ netif_addr_lock_bh(to);
+ err = __dev_addr_sync(&to->uc_list, &to->uc_count,
+ &from->uc_list, &from->uc_count);
+ if (!err) /* on success, refresh the destination's RX mode */
+ __dev_set_rx_mode(to);
+ netif_addr_unlock_bh(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_unicast_sync);
+
+/**
+ * dev_unicast_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_unicast_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices. Takes both address locks.
+ */
+void dev_unicast_unsync(struct net_device *to, struct net_device *from)
+{
+ netif_addr_lock_bh(from); /* lock order: @from first, then @to */
+ netif_addr_lock(to);
+
+ __dev_addr_unsync(&to->uc_list, &to->uc_count,
+ &from->uc_list, &from->uc_count);
+ __dev_set_rx_mode(to);
+
+ netif_addr_unlock(to); /* released in reverse order */
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_unicast_unsync);
+
+static void __dev_addr_discard(struct dev_addr_list **list)
+{ /* Free every entry of *@list, leaving it empty; logs entries that still have extra users. */
+ struct dev_addr_list *tmp;
+
+ while (*list != NULL) {
+ tmp = *list;
+ *list = tmp->next; /* pop the head before freeing it */
+ if (tmp->da_users > tmp->da_gusers) /* more users than global users remain */
+ printk("__dev_addr_discard: address leakage! "
+ "da_users=%d\n", tmp->da_users);
+ kfree(tmp);
+ }
+}
+
+static void dev_addr_discard(struct net_device *dev)
+{ /* Empty @dev's unicast and multicast lists under its address-list lock. */
+ netif_addr_lock_bh(dev);
+
+ __dev_addr_discard(&dev->uc_list);
+ dev->uc_count = 0; /* list is now empty */
+
+ __dev_addr_discard(&dev->mc_list);
+ dev->mc_count = 0; /* list is now empty */
+
+ netif_addr_unlock_bh(dev);
+}
+
+/**
+ * dev_get_flags - get flags reported to userspace
+ * @dev: device to query
+ *
+ * Get the combination of flag bits exported through APIs to userspace.
+ */
+unsigned dev_get_flags(const struct net_device *dev)
+{
+ unsigned flags;
+
+ flags = (dev->flags & ~(IFF_PROMISC | /* mask out the bits that are recomputed below */
+ IFF_ALLMULTI |
+ IFF_RUNNING |
+ IFF_LOWER_UP |
+ IFF_DORMANT)) |
+ (dev->gflags & (IFF_PROMISC | /* promisc/allmulti are reported from gflags */
+ IFF_ALLMULTI));
+
+ if (netif_running(dev)) { /* state bits are only reported for a running device */
+ if (netif_oper_up(dev))
+ flags |= IFF_RUNNING;
+ if (netif_carrier_ok(dev))
+ flags |= IFF_LOWER_UP;
+ if (netif_dormant(dev))
+ flags |= IFF_DORMANT;
+ }
+
+ return flags;
+}
+
+/**
+ * dev_change_flags - change device settings
+ * @dev: device
+ * @flags: device state flags
+ *
+ * Change settings on the device based on state flags. The flags are in the
+ * userspace exported format. Returns 0 or the dev_open/dev_close result.
+ */
+int dev_change_flags(struct net_device *dev, unsigned flags)
+{
+ int ret, changes;
+ int old_flags = dev->flags; /* snapshot used for all change detection below */
+
+ ASSERT_RTNL();
+
+ /*
+ * Set the flags on our device.
+ */
+
+ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | /* bits taken from the request */
+ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
+ IFF_AUTOMEDIA)) |
+ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | /* bits kept from the device */
+ IFF_ALLMULTI));
+
+ /*
+ * Load in the correct multicast list now the flags have changed.
+ */
+
+ if ((old_flags ^ flags) & IFF_MULTICAST)
+ dev_change_rx_flags(dev, IFF_MULTICAST);
+
+ dev_set_rx_mode(dev);
+
+ /*
+ * Have we downed the interface? We handle IFF_UP ourselves
+ * according to user attempts to set it, rather than blindly
+ * setting it.
+ */
+
+ ret = 0;
+ if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
+ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
+
+ if (!ret)
+ dev_set_rx_mode(dev);
+ }
+
+ if (dev->flags & IFF_UP &&
+ ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
+ IFF_VOLATILE)))
+ call_netdevice_notifiers(NETDEV_CHANGE, dev);
+
+ if ((flags ^ dev->gflags) & IFF_PROMISC) {
+ int inc = (flags & IFF_PROMISC) ? +1 : -1;
+ dev->gflags ^= IFF_PROMISC;
+ dev_set_promiscuity(dev, inc); /* NOTE(review): return value is ignored */
+ }
+
+ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
+ is important. Some (broken) drivers set IFF_PROMISC, when
+ IFF_ALLMULTI is requested not asking us and not reporting.
+ */
+ if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
+ int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
+ dev->gflags ^= IFF_ALLMULTI;
+ dev_set_allmulti(dev, inc); /* NOTE(review): return value is ignored */
+ }
+
+ /* Exclude state transition flags, already notified */
+ changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
+ if (changes)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
+
+ return ret;
+}
+
+/**
+ * dev_set_mtu - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit (negative values are rejected)
+ *
+ * Change the maximum transfer size of the network device.
+ */
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ int err;
+
+ if (new_mtu == dev->mtu) /* nothing to change */
+ return 0;
+
+ /* MTU must not be negative (zero is accepted here). */
+ if (new_mtu < 0)
+ return -EINVAL;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ err = 0;
+ if (dev->change_mtu) /* driver hook applies the change when provided */
+ err = dev->change_mtu(dev, new_mtu);
+ else
+ dev->mtu = new_mtu;
+ if (!err && dev->flags & IFF_UP) /* notify only on success and while the device is up */
+ call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ return err;
+}
+
+/**
+ * dev_set_mac_address - Change Media Access Control Address
+ * @dev: device
+ * @sa: new address; sa_family must equal dev->type
+ *
+ * Change the hardware (MAC) address of the device via its set_mac_address hook
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+{
+ int err;
+
+ if (!dev->set_mac_address) /* no driver hook: operation unsupported */
+ return -EOPNOTSUPP;
+ if (sa->sa_family != dev->type)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ err = dev->set_mac_address(dev, sa);
+ if (!err) /* announce the change only on success */
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+
+/*
+ * Perform the read-only SIOCGIFxxx calls, inside read_lock(dev_base_lock)
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = dev_get_flags(dev);
+ return 0;
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface
+ (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCGIFHWADDR: /* Get the hardware address */
+ if (!dev->addr_len)
+ memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
+ else
+ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ ifr->ifr_hwaddr.sa_family = dev->type;
+ return 0;
+
+ case SIOCGIFSLAVE: /* always rejected with -EINVAL */
+ err = -EINVAL;
+ break;
+
+ case SIOCGIFMAP: /* Get the device's memory/IO/irq/dma/port map */
+ ifr->ifr_map.mem_start = dev->mem_start;
+ ifr->ifr_map.mem_end = dev->mem_end;
+ ifr->ifr_map.base_addr = dev->base_addr;
+ ifr->ifr_map.irq = dev->irq;
+ ifr->ifr_map.dma = dev->dma;
+ ifr->ifr_map.port = dev->if_port;
+ return 0;
+
+ case SIOCGIFINDEX: /* Get the interface index */
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN: /* Get the TX queue length */
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ default:
+ /* dev_ioctl() should ensure this case
+ * is never reached
+ */
+ WARN_ON(1);
+ err = -EINVAL;
+ break;
+
+ }
+ return err;
+}
+
+/*
+ * Perform the state-changing and private SIOCxIFxxx calls, inside rtnl_lock()
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags);
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface
+ (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ return dev_set_mtu(dev, ifr->ifr_mtu);
+
+ case SIOCSIFHWADDR: /* Set the hardware address */
+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+
+ case SIOCSIFHWBROADCAST: /* Set the hardware broadcast address */
+ if (ifr->ifr_hwaddr.sa_family != dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return 0;
+
+ case SIOCSIFMAP: /* Hand the map to the driver's set_config hook */
+ if (dev->set_config) {
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev->set_config(dev, &ifr->ifr_map);
+ }
+ return -EOPNOTSUPP;
+
+ case SIOCADDMULTI: /* Add a multicast address */
+ if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
+ dev->addr_len, 1);
+
+ case SIOCDELMULTI: /* Delete a multicast address */
+ if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
+ dev->addr_len, 1);
+
+ case SIOCSIFTXQLEN: /* Set the TX queue length */
+ if (ifr->ifr_qlen < 0)
+ return -EINVAL;
+ dev->tx_queue_len = ifr->ifr_qlen;
+ return 0;
+
+ case SIOCSIFNAME: /* Rename the interface */
+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+ return dev_change_name(dev, ifr->ifr_newname);
+
+ /*
+ * Unknown or private ioctl: forwarded to the driver's do_ioctl hook
+ */
+
+ default:
+ if ((cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) ||
+ cmd == SIOCBONDENSLAVE ||
+ cmd == SIOCBONDRELEASE ||
+ cmd == SIOCBONDSETHWADDR ||
+ cmd == SIOCBONDSLAVEINFOQUERY ||
+ cmd == SIOCBONDINFOQUERY ||
+ cmd == SIOCBONDCHANGEACTIVE ||
+ cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCBRADDIF ||
+ cmd == SIOCBRDELIF ||
+ cmd == SIOCWANDEV) {
+ err = -EOPNOTSUPP; /* result when the driver has no do_ioctl hook */
+ if (dev->do_ioctl) {
+ if (netif_device_present(dev))
+ err = dev->do_ioctl(dev, ifr,
+ cmd);
+ else
+ err = -ENODEV;
+ }
+ } else
+ err = -EINVAL;
+
+ }
+ return err;
+}
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc_locked and dev_ifsioc above.
+ */
+
+/**
+ * dev_ioctl - network device ioctl
+ * @net: the applicable net namespace
+ * @cmd: command to issue
+ * @arg: pointer to a struct ifreq in user space
+ *
+ * Issue ioctl functions to devices. This is normally called by the
+ * user space syscall interfaces but can sometimes be useful for
+ * other purposes. The return value is the return from the syscall if
+ * positive or a negative errno code on error.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ int ret;
+ char *colon;
+
+ /* One special case: SIOCGIFCONF takes ifconf argument
+ and requires shared lock, because it sleeps writing
+ to user space.
+ */
+
+ if (cmd == SIOCGIFCONF) {
+ rtnl_lock();
+ ret = dev_ifconf(net, (char __user *) arg);
+ rtnl_unlock();
+ return ret;
+ }
+ if (cmd == SIOCGIFNAME) /* handled separately, before the ifreq is copied in */
+ return dev_ifname(net, (struct ifreq __user *)arg);
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0; /* force NUL termination of the user-supplied name */
+
+ colon = strchr(ifr.ifr_name, ':'); /* name is cut at ':' for the lookup */
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch (cmd) {
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFHWADDR:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(net, ifr.ifr_name);
+ read_lock(&dev_base_lock);
+ ret = dev_ifsioc_locked(net, &ifr, cmd);
+ read_unlock(&dev_base_lock);
+ if (!ret) {
+ if (colon) /* restore the ':' before copying the request back */
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ case SIOCETHTOOL:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ethtool(net, &ifr);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - return a value
+ */
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSIFNAME:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFMAP:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSIFTXQLEN:
+ case SIOCSMIIREG:
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through: the two query ioctls below skip the capability check */
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but
+ * currently do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space.
+ * Not applicable in our case */
+ case SIOCSIFLINK:
+ return -EINVAL;
+
+ /*
+ * Unknown or private ioctl.
+ */
+ default:
+ if (cmd == SIOCWANDEV ||
+ (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret && copy_to_user(arg, &ifr, /* NOTE(review): ':' is not restored on this path */
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ return ret;
+ }
+ /* Take care of Wireless Extensions */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
+ return wext_handle_ioctl(net, &ifr, cmd, arg);
+ return -EINVAL;
+ }
+}
+
+
+/**
+ * dev_new_index - allocate an ifindex
+ * @net: the applicable net namespace
+ *
+ * Returns a positive ifindex not currently used by any device in @net.
+ * The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
+ */
+static int dev_new_index(struct net *net)
+{
+ static int ifindex; /* last candidate tried; persists across calls */
+ for (;;) {
+ if (++ifindex <= 0) /* keep candidates strictly positive */
+ ifindex = 1;
+ if (!__dev_get_by_index(net, ifindex)) /* not in use in @net: take it */
+ return ifindex;
+ }
+}
+
+/* Delayed registration/unregistration */
+static LIST_HEAD(net_todo_list);
+
+static void net_set_todo(struct net_device *dev)
+{ /* Queue @dev at the tail of net_todo_list. */
+ list_add_tail(&dev->todo_list, &net_todo_list);
+}
+
+static void rollback_registered(struct net_device *dev)
+{ /* Tear down a registered device; RTNL must be held (asserted below). */
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ /* Some devices call without registering for initialization unwind. */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
+ "was registered\n", dev->name, dev);
+
+ WARN_ON(1);
+ return;
+ }
+
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+
+ /* If device is running, close it first. */
+ dev_close(dev);
+
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+
+ dev->reg_state = NETREG_UNREGISTERING;
+
+ synchronize_net();
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+
+ /* Notify protocols that we are about to destroy
+ this device. They should clean up everything they hold.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_addr_discard(dev);
+
+ if (dev->uninit) /* optional driver teardown hook */
+ dev->uninit(dev);
+
+ /* Notifier chain MUST detach us from master device. */
+ WARN_ON(dev->master);
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+
+ synchronize_net();
+
+ dev_put(dev);
+}
+
+static void __netdev_init_queue_locks_one(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_unused)
+{ /* Initialise one queue's _xmit_lock; signature fits netdev_for_each_tx_queue(). */
+ spin_lock_init(&dev_queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
+ dev_queue->xmit_lock_owner = -1; /* -1: lock starts out with no owner recorded */
+}
+
+static void netdev_init_queue_locks(struct net_device *dev)
+{ /* Initialise the xmit lock of every TX queue and of the RX queue. */
+ netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
+ __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
+}
+
+unsigned long netdev_fix_features(unsigned long features, const char *name)
+{ /* Return @features with unsupported combinations cleared; @name (may be NULL) tags the logs. */
+ /* Fix illegal SG+CSUM combinations. */
+ if ((features & NETIF_F_SG) &&
+ !(features & NETIF_F_ALL_CSUM)) {
+ if (name)
+ printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
+ "checksum feature.\n", name);
+ features &= ~NETIF_F_SG;
+ }
+
+ /* TSO requires that SG is present as well (SG may just have been dropped above). */
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
+ if (name)
+ printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
+ "SG feature.\n", name);
+ features &= ~NETIF_F_TSO;
+ }
+
+ if (features & NETIF_F_UFO) { /* UFO needs both a GEN_CSUM bit and SG */
+ if (!(features & NETIF_F_GEN_CSUM)) { /* NOTE(review): the message names NETIF_F_HW_CSUM */
+ if (name)
+ printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
+ "since no NETIF_F_HW_CSUM feature.\n",
+ name);
+ features &= ~NETIF_F_UFO;
+ }
+
+ if (!(features & NETIF_F_SG)) {
+ if (name)
+ printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
+ "since no NETIF_F_SG feature.\n", name);
+ features &= ~NETIF_F_UFO;
+ }
+ }
+
+ return features;
+}
+EXPORT_SYMBOL(netdev_fix_features);
+
+/**
+ * register_netdevice - register a network device
+ * @dev: device to register
+ *
+ * Take a completed network device structure and add it to the kernel
+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ * chain. 0 is returned on success. A negative errno code is returned
+ * on a failure to set up the device, or if the name is a duplicate.
+ *
+ * Callers must hold the rtnl semaphore. You may want
+ * register_netdev() instead of this.
+ *
+ * BUGS:
+ * The locking appears insufficient to guarantee two parallel registers
+ * will not get the same name.
+ */
+
+int register_netdevice(struct net_device *dev)
+{
+ struct hlist_head *head;
+ struct hlist_node *p;
+ int ret;
+ struct net *net;
+
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ might_sleep();
+
+ /* When net_device's are persistent, this will be fatal. */
+ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+ BUG_ON(!dev_net(dev));
+ net = dev_net(dev);
+
+ spin_lock_init(&dev->addr_list_lock);
+ netdev_set_addr_lockdep_class(dev);
+ netdev_init_queue_locks(dev);
+
+ dev->iflink = -1; /* -1 = unset; defaulted to ifindex below if still -1 */
+
+ /* Init, if this function is available */
+ if (dev->init) {
+ ret = dev->init(dev);
+ if (ret) {
+ if (ret > 0) /* normalise positive init() results to -EIO */
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ if (!dev_valid_name(dev->name)) {
+ ret = -EINVAL;
+ goto err_uninit;
+ }
+
+ dev->ifindex = dev_new_index(net);
+ if (dev->iflink == -1)
+ dev->iflink = dev->ifindex;
+
+ /* Check for existence of name */
+ head = dev_name_hash(net, dev->name);
+ hlist_for_each(p, head) {
+ struct net_device *d
+ = hlist_entry(p, struct net_device, name_hlist);
+ if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
+ ret = -EEXIST;
+ goto err_uninit;
+ }
+ }
+
+ /* Fix illegal checksum combinations */
+ if ((dev->features & NETIF_F_HW_CSUM) &&
+ (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
+ dev->name);
+ dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
+
+ if ((dev->features & NETIF_F_NO_CSUM) &&
+ (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
+ dev->name);
+ dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ }
+
+ dev->features = netdev_fix_features(dev->features, dev->name);
+
+ /* Enable software GSO if SG is supported. */
+ if (dev->features & NETIF_F_SG)
+ dev->features |= NETIF_F_GSO;
+
+ netdev_initialize_kobject(dev);
+ ret = netdev_register_kobject(dev);
+ if (ret)
+ goto err_uninit;
+ dev->reg_state = NETREG_REGISTERED;
+
+ /*
+ * Default initial state at registry is that the
+ * device is present.
+ */
+
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+
+ dev_init_scheduler(dev);
+ dev_hold(dev); /* NOTE(review): looks paired with the dev_put() in rollback_registered() */
+ list_netdevice(dev);
+
+ /* Notify protocols, that a new device appeared. */
+ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ ret = notifier_to_errno(ret);
+ if (ret) { /* a notifier vetoed the registration: undo it */
+ rollback_registered(dev);
+ dev->reg_state = NETREG_UNREGISTERED;
+ }
+
+out:
+ return ret;
+
+err_uninit: /* failure after dev->init() ran: give the driver its uninit callback */
+ if (dev->uninit)
+ dev->uninit(dev);
+ goto out;
+}
+
+/**
+ * register_netdev - register a network device
+ * @dev: device to register
+ *
+ * Take a completed network device structure and add it to the kernel
+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ * chain. 0 is returned on success. A negative errno code is returned
+ * on a failure to set up the device, or if the name is a duplicate.
+ *
+ * This is a wrapper around register_netdevice that takes the rtnl semaphore
+ * and expands the device name if you passed a format string to
+ * alloc_netdev.
+ */
+int register_netdev(struct net_device *dev)
+{
+ int err;
+
+ rtnl_lock();
+
+ /*
+ * If the name is a format string (contains '%') the caller wants us
+ * to do a name allocation.
+ */
+ if (strchr(dev->name, '%')) {
+ err = dev_alloc_name(dev, dev->name);
+ if (err < 0) /* name allocation failed: return its error without registering */
+ goto out;
+ }
+
+ err = register_netdevice(dev);
+out:
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdev);
+
+/*
+ * netdev_wait_allrefs - wait until all references are gone.
+ *
+ * This is called when unregistering network devices.
+ *
+ * Any protocol or device that holds a reference should register
+ * for netdevice notification, and clean up and put back the
+ * reference if they receive an UNREGISTER event.
+ * We can get stuck here if buggy protocols don't correctly
+ * call dev_put.
+ */
+static void netdev_wait_allrefs(struct net_device *dev)
+{
+ unsigned long rebroadcast_time, warning_time;
+
+ rebroadcast_time = warning_time = jiffies;
+ while (atomic_read(&dev->refcnt) != 0) {
+ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { /* at most once per second */
+ rtnl_lock();
+
+ /* Rebroadcast unregister notification */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ }
+
+ __rtnl_unlock();
+
+ rebroadcast_time = jiffies;
+ }
+
+ msleep(250); /* re-check the refcount every 250 ms */
+
+ if (time_after(jiffies, warning_time + 10 * HZ)) { /* complain at most every 10 seconds */
+ printk(KERN_EMERG "unregister_netdevice: "
+ "waiting for %s to become free. Usage "
+ "count = %d\n",
+ dev->name, atomic_read(&dev->refcnt));
+ warning_time = jiffies;
+ }
+ }
+}
+
+/* The sequence is:
+ *
+ * rtnl_lock();
+ * ...
+ * register_netdevice(x1);
+ * register_netdevice(x2);
+ * ...
+ * unregister_netdevice(y1);
+ * unregister_netdevice(y2);
+ * ...
+ * rtnl_unlock();
+ * free_netdev(y1);
+ * free_netdev(y2);
+ *
+ * We are invoked by rtnl_unlock().
+ * This allows us to deal with two problems:
+ * 1) We can delete sysfs objects which invoke hotplug
+ * without deadlocking with linkwatch via keventd.
+ * 2) Since we run with the RTNL semaphore not held, we can sleep
+ * safely in order to wait for the netdev refcnt to drop to zero.
+ *
+ * We must not return until all unregister events added during
+ * the interval the lock was held have been completed.
+ */
+void netdev_run_todo(void)
+{
+ struct list_head list;
+
+ /* Snapshot list, allow later requests */
+ list_replace_init(&net_todo_list, &list);
+
+ __rtnl_unlock(); /* RTNL is dropped before the sleeping waits below */
+
+ while (!list_empty(&list)) {
+ struct net_device *dev
+ = list_entry(list.next, struct net_device, todo_list);
+ list_del(&dev->todo_list);
+
+ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
+ printk(KERN_ERR "network todo '%s' but state %d\n",
+ dev->name, dev->reg_state);
+ dump_stack();
+ continue; /* skip entries in an unexpected state */
+ }
+
+ dev->reg_state = NETREG_UNREGISTERED;
+
+ on_each_cpu(flush_backlog, dev, 1);
+
+ netdev_wait_allrefs(dev);
+
+ /* paranoia */
+ BUG_ON(atomic_read(&dev->refcnt));
+ WARN_ON(dev->ip_ptr);
+ WARN_ON(dev->ip6_ptr);
+ WARN_ON(dev->dn_ptr);
+
+ if (dev->destructor) /* optional driver destructor hook */
+ dev->destructor(dev);
+
+ /* Free network device */
+ kobject_put(&dev->dev.kobj);
+ }
+}
+
+static struct net_device_stats *internal_stats(struct net_device *dev)
+{ /* Default get_stats hook installed by alloc_netdev_mq(): returns the embedded stats. */
+ return &dev->stats;
+}
+
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue,
+ void *_unused)
+{ /* Point @queue back at its owning device; signature fits netdev_for_each_tx_queue(). */
+ queue->dev = dev;
+}
+
+static void netdev_init_queues(struct net_device *dev)
+{ /* Set the back-pointer of the RX queue and every TX queue, then init tx_global_lock. */
+ netdev_init_one_queue(dev, &dev->rx_queue, NULL);
+ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ spin_lock_init(&dev->tx_global_lock);
+}
+
+/**
+ * alloc_netdev_mq - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @setup: callback to initialize device
+ * @queue_count: the number of subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization. Also allocates a separate array of
+ * @queue_count TX queue structs. Returns NULL if either allocation fails.
+ */
+struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *), unsigned int queue_count)
+{
+ struct netdev_queue *tx;
+ struct net_device *dev;
+ size_t alloc_size;
+ void *p;
+
+ BUG_ON(strlen(name) >= sizeof(dev->name)); /* guarantees the strcpy() at the end fits */
+
+ alloc_size = sizeof(struct net_device);
+ if (sizeof_priv) {
+ /* ensure 32-byte alignment of private area */
+ alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
+ alloc_size += sizeof_priv;
+ }
+ /* ensure 32-byte alignment of whole construct */
+ alloc_size += NETDEV_ALIGN_CONST;
+
+ p = kzalloc(alloc_size, GFP_KERNEL);
+ if (!p) {
+ printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
+ return NULL;
+ }
+
+ tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
+ if (!tx) {
+ printk(KERN_ERR "alloc_netdev: Unable to allocate "
+ "tx qdiscs.\n");
+ kfree(p); /* do not leak the device allocation */
+ return NULL;
+ }
+
+ dev = (struct net_device *)
+ (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
+ dev->padded = (char *)dev - (char *)p; /* offset back to the kzalloc()ed pointer */
+ dev_net_set(dev, &init_net);
+
+ dev->_tx = tx;
+ dev->num_tx_queues = queue_count;
+ dev->real_num_tx_queues = queue_count;
+
+ if (sizeof_priv) { /* private area starts at the aligned end of struct net_device */
+ dev->priv = ((char *)dev +
+ ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
+ & ~NETDEV_ALIGN_CONST));
+ }
+
+ dev->gso_max_size = GSO_MAX_SIZE;
+
+ netdev_init_queues(dev);
+
+ dev->get_stats = internal_stats; /* default; set before setup() runs */
+ netpoll_netdev_init(dev);
+ setup(dev);
+ strcpy(dev->name, name); /* length was checked by the BUG_ON() above */
+ return dev;
+}
+EXPORT_SYMBOL(alloc_netdev_mq);
+
+/**
+ * free_netdev - free network device
+ * @dev: device
+ *
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released.
+ * If this is the last reference then it will be freed.
+ */
+void free_netdev(struct net_device *dev)
+{
+ release_net(dev_net(dev));
+
+ kfree(dev->_tx); /* the TX queue array is a separate allocation */
+
+ /* Compatibility with error handling in drivers: never-registered devices are freed directly */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ kfree((char *)dev - dev->padded); /* step back by the alignment padding to the allocated pointer */
+ return;
+ }
+
+ BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
+ dev->reg_state = NETREG_RELEASED;
+
+ /* will free via device release */
+ put_device(&dev->dev);
+}
+
+/**
+ * synchronize_net - Synchronize with packet receive processing
+ *
+ * Wait for packets currently being received to be done.
+ * Does not block later packets from starting. May sleep.
+ */
+void synchronize_net(void)
+{
+ might_sleep();
+ synchronize_rcu(); /* the wait is implemented as an RCU grace period */
+}
+
+/**
+ * unregister_netdevice - remove device from the kernel
+ * @dev: device
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables. The remaining work is queued for netdev_run_todo().
+ *
+ * Callers must hold the rtnl semaphore. You may want
+ * unregister_netdev() instead of this.
+ */
+
+void unregister_netdevice(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ rollback_registered(dev);
+ /* Finish processing unregister after unlock */
+ net_set_todo(dev);
+}
+
+/**
+ * unregister_netdev - remove device from the kernel
+ * @dev: device
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables.
+ *
+ * This is just a wrapper for unregister_netdevice() that takes
+ * the rtnl semaphore around the call. In general you want to use
+ * this and not unregister_netdevice().
+ */
+void unregister_netdev(struct net_device *dev)
+{
+ rtnl_lock();
+ unregister_netdevice(dev);
+ rtnl_unlock();
+}
+
+EXPORT_SYMBOL(unregister_netdev);
+
+/**
+ * dev_change_net_namespace - move device to different network namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a negative errno code is returned.
+ *
+ * Failures are only reported before the device is touched: -EINVAL for
+ * a namespace-local or unregistered device, -EEXIST when no usable name
+ * can be found in @net. Moving a device to the namespace it is already
+ * in is a successful no-op.
+ *
+ * Callers must hold the rtnl semaphore.
+ */
+
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+ char buf[IFNAMSIZ];
+ const char *destname;
+ int err;
+
+ ASSERT_RTNL();
+
+ /* Don't allow namespace local devices to be moved. */
+ err = -EINVAL;
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ goto out;
+
+ /* Ensure the device has been registered */
+ err = -EINVAL;
+ if (dev->reg_state != NETREG_REGISTERED)
+ goto out;
+
+ /* Get out if there is nothing to do */
+ err = 0;
+ if (net_eq(dev_net(dev), net))
+ goto out;
+
+ /* Pick the destination device name, and ensure
+ * we can use it in the destination network namespace.
+ */
+ err = -EEXIST;
+ destname = dev->name;
+ if (__dev_get_by_name(net, destname)) {
+ /* We get here if we can't use the current device name */
+ if (!pat)
+ goto out;
+ if (!dev_valid_name(pat))
+ goto out;
+ if (strchr(pat, '%')) {
+ /* @pat is a format pattern: let the allocator expand it into buf */
+ if (__dev_alloc_name(net, pat, buf) < 0)
+ goto out;
+ destname = buf;
+ } else
+ destname = pat;
+ if (__dev_get_by_name(net, destname))
+ goto out;
+ }
+
+ /*
+ * And now a mini version of register_netdevice and
+ * unregister_netdevice. No exit path is taken from here on.
+ */
+
+ /* If device is running close it first. */
+ dev_close(dev);
+
+ /* And unlink it from device chain */
+ err = -ENODEV; /* never returned: err is reassigned below before 'out' */
+ unlist_netdevice(dev);
+
+ synchronize_net();
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ /* Notify protocols, that we are about to destroy
+ this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_addr_discard(dev);
+
+ /* Actually switch the network namespace */
+ dev_net_set(dev, net);
+
+ /* Assign the new device name */
+ if (destname != dev->name)
+ strcpy(dev->name, destname);
+
+ /* If there is an ifindex conflict assign a new one */
+ if (__dev_get_by_index(net, dev->ifindex)) {
+ /* remember whether iflink pointed at the device itself */
+ int iflink = (dev->iflink == dev->ifindex);
+ dev->ifindex = dev_new_index(net);
+ if (iflink)
+ dev->iflink = dev->ifindex;
+ }
+
+ /* Fixup kobjects */
+ netdev_unregister_kobject(dev);
+ err = netdev_register_kobject(dev);
+ /* a failure here is only warned about; the move still completes */
+ WARN_ON(err);
+
+ /* Add the device back in the hashes */
+ list_netdevice(dev);
+
+ /* Notify protocols, that a new device appeared. */
+ call_netdevice_notifiers(NETDEV_REGISTER, dev);
+
+ synchronize_net();
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * CPU notifier callback (installed with hotcpu_notifier() in
+ * net_dev_init()). Acts only on CPU_DEAD / CPU_DEAD_FROZEN: the
+ * softnet_data work of the dead CPU @ocpu is taken over by the CPU
+ * running the callback. With local interrupts disabled the dead CPU's
+ * completion_queue and output_queue are appended to the local ones and
+ * NET_TX_SOFTIRQ is raised; afterwards the dead CPU's input_pkt_queue
+ * is drained through netif_rx(). Always returns NOTIFY_OK.
+ */
+static int dev_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *ocpu)
+{
+ struct sk_buff **list_skb;
+ struct Qdisc **list_net;
+ struct sk_buff *skb;
+ unsigned int cpu, oldcpu = (unsigned long)ocpu;
+ struct softnet_data *sd, *oldsd;
+
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+ return NOTIFY_OK;
+
+ local_irq_disable();
+ cpu = smp_processor_id();
+ sd = &per_cpu(softnet_data, cpu);
+ oldsd = &per_cpu(softnet_data, oldcpu);
+
+ /* Find end of our completion_queue. */
+ list_skb = &sd->completion_queue;
+ while (*list_skb)
+ list_skb = &(*list_skb)->next;
+ /* Append completion queue from offline CPU. */
+ *list_skb = oldsd->completion_queue;
+ oldsd->completion_queue = NULL;
+
+ /* Find end of our output_queue. */
+ list_net = &sd->output_queue;
+ while (*list_net)
+ list_net = &(*list_net)->next_sched;
+ /* Append output queue from offline CPU. */
+ *list_net = oldsd->output_queue;
+ oldsd->output_queue = NULL;
+
+ /* have the TX softirq run on this CPU to process what was just appended */
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_enable();
+
+ /* Process offline CPU's input_pkt_queue */
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+ netif_rx(skb);
+
+ return NOTIFY_OK;
+}
+
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance - try to maintain one DMA channel per CPU
+ * @net_dma: DMA client and associated data (lock, channels, channel_mask)
+ *
+ * This is called when the number of channels allocated to the net_dma client
+ * changes.  The net_dma client tries to have one DMA channel per CPU.
+ *
+ * With no channels left every online CPU gets a NULL channel.  Otherwise
+ * the online CPUs are handed out in order: each channel serves
+ * num_online_cpus() / nr_channels CPUs, and the first
+ * num_online_cpus() % nr_channels channels serve one more.
+ */
+
+static void net_dma_rebalance(struct net_dma *net_dma)
+{
+	unsigned int cpu, i, n, chan_idx;
+	struct dma_chan *chan;
+
+	if (cpus_empty(net_dma->channel_mask)) {
+		for_each_online_cpu(cpu)
+			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
+		return;
+	}
+
+	i = 0;
+	cpu = first_cpu(cpu_online_map);
+
+	for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
+		chan = net_dma->channels[chan_idx];
+
+		/* base share, plus one for the first (cpus % channels) channels */
+		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
+		   + (i < (num_online_cpus() %
+			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
+
+		while (n) {
+			/*
+			 * Publish the channel the same way the NULL case
+			 * above does, so the pointer store is ordered after
+			 * the channel's initialisation.
+			 */
+			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma,
+					   chan);
+			cpu = next_cpu(cpu, cpu_online_map);
+			n--;
+		}
+		i++;
+	}
+}
+
+/**
+ * netdev_dma_event - event callback for the net_dma_client
+ * @client: should always be net_dma_client
+ * @chan: DMA channel for the event
+ * @state: DMA state to be handled
+ *
+ * Runs under net_dma->lock.  A newly available channel is stored in the
+ * first free slot unless it is already known; a removed channel has its
+ * slot cleared.  Both changes trigger a rebalance and are answered with
+ * DMA_ACK; every other case returns DMA_DUP.
+ */
+static enum dma_state_client
+netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_state state)
+{
+	struct net_dma *net_dma =
+		container_of(client, struct net_dma, client);
+	enum dma_state_client ack = DMA_DUP; /* default: take no action */
+	int i, found = 0, pos = -1;
+
+	spin_lock(&net_dma->lock);
+	switch (state) {
+	case DMA_RESOURCE_AVAILABLE:
+		/* look for the channel, remembering the first empty slot */
+		for (i = 0; i < nr_cpu_ids; i++) {
+			if (net_dma->channels[i] == chan) {
+				found = 1;
+				break;
+			}
+			if (pos < 0 && net_dma->channels[i] == NULL)
+				pos = i;
+		}
+
+		if (found || pos < 0)
+			break;
+
+		ack = DMA_ACK;
+		net_dma->channels[pos] = chan;
+		cpu_set(pos, net_dma->channel_mask);
+		net_dma_rebalance(net_dma);
+		break;
+	case DMA_RESOURCE_REMOVED:
+		/* pos stays negative unless the channel is one of ours */
+		for (i = 0; i < nr_cpu_ids; i++) {
+			if (net_dma->channels[i] == chan) {
+				pos = i;
+				break;
+			}
+		}
+
+		if (pos < 0)
+			break;
+
+		ack = DMA_ACK;
+		cpu_clear(pos, net_dma->channel_mask);
+		net_dma->channels[pos] = NULL;
+		net_dma_rebalance(net_dma);
+		break;
+	default:
+		break;
+	}
+	spin_unlock(&net_dma->lock);
+
+	return ack;
+}
+
+/**
+ * netdev_dma_register - register the networking subsystem as a DMA client
+ *
+ * Allocates the zeroed, nr_cpu_ids-entry channel table, then registers
+ * net_dma.client with the DMA_MEMCPY capability and requests channels.
+ *
+ * Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int __init netdev_dma_register(void)
+{
+	/*
+	 * channels[] is an array of channel pointers, so size it by its
+	 * element type rather than by struct net_dma; kcalloc() also
+	 * guards the multiplication against overflow.
+	 */
+	net_dma.channels = kcalloc(nr_cpu_ids, sizeof(*net_dma.channels),
+				   GFP_KERNEL);
+	if (unlikely(!net_dma.channels)) {
+		printk(KERN_NOTICE
+				"netdev_dma: no memory for net_dma.channels\n");
+		return -ENOMEM;
+	}
+	spin_lock_init(&net_dma.lock);
+	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
+	dma_async_client_register(&net_dma.client);
+	dma_async_client_chan_request(&net_dma.client);
+	return 0;
+}
+
+#else
+/* Built without CONFIG_NET_DMA: there is nothing to register. */
+static int __init netdev_dma_register(void) { return -ENODEV; }
+#endif /* CONFIG_NET_DMA */
+
+/**
+ * netdev_increment_features - increment feature set by one
+ * @all: current feature set
+ * @one: new feature set
+ * @mask: mask feature set
+ *
+ * Computes a new feature set after adding a device with feature set
+ * @one to the master device with current feature set @all. Will not
+ * enable anything that is off in @mask. Returns the new feature set.
+ *
+ * The checksum bits are settled first; the remaining bits are then
+ * combined so that NETIF_F_ONE_FOR_ALL features are kept or added
+ * (subject to @mask) while everything else except NETIF_F_LLTX and
+ * NETIF_F_GSO must also be present in @one to survive.
+ */
+unsigned long netdev_increment_features(unsigned long all, unsigned long one,
+ unsigned long mask)
+{
+ /* If device needs checksumming, downgrade to it. */
+ if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
+ all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
+ else if (mask & NETIF_F_ALL_CSUM) {
+ /* If one device supports v4/v6 checksumming, set for all. */
+ if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
+ !(all & NETIF_F_GEN_CSUM)) {
+ all &= ~NETIF_F_ALL_CSUM;
+ all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+ }
+
+ /* If one device supports hw checksumming, set for all. */
+ if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
+ all &= ~NETIF_F_ALL_CSUM;
+ all |= NETIF_F_HW_CSUM;
+ }
+ }
+
+ /* checksum bits were handled above: keep the AND below from clearing them */
+ one |= NETIF_F_ALL_CSUM;
+
+ one |= all & NETIF_F_ONE_FOR_ALL;
+ all &= one | NETIF_F_LLTX | NETIF_F_GSO;
+ all |= one & mask & NETIF_F_ONE_FOR_ALL;
+
+ return all;
+}
+EXPORT_SYMBOL(netdev_increment_features);
+
+/*
+ * Allocate a table of NETDEV_HASHENTRIES hash chain heads and initialise
+ * every chain as empty.  Returns NULL if the allocation fails.
+ */
+static struct hlist_head *netdev_create_hash(void)
+{
+	struct hlist_head *table;
+	int n;
+
+	table = kmalloc(sizeof(*table) * NETDEV_HASHENTRIES, GFP_KERNEL);
+	if (table == NULL)
+		return NULL;
+
+	for (n = 0; n < NETDEV_HASHENTRIES; n++)
+		INIT_HLIST_HEAD(table + n);
+
+	return table;
+}
+
+/*
+ * Set up the per network namespace device state: the device list head
+ * and the name and ifindex hash tables.  Returns 0, or -ENOMEM with
+ * nothing left allocated.
+ */
+static int __net_init netdev_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net->dev_base_head);
+
+	net->dev_name_head = netdev_create_hash();
+	if (!net->dev_name_head)
+		return -ENOMEM;
+
+	net->dev_index_head = netdev_create_hash();
+	if (!net->dev_index_head) {
+		/* undo the name table before reporting the failure */
+		kfree(net->dev_name_head);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ *	netdev_drivername - network driver for the device
+ *	@dev: network device
+ *	@buffer: buffer for resulting name
+ *	@len: size of buffer
+ *
+ *	Copies the name of the driver bound to the device's parent into
+ *	@buffer.  @buffer is left as an empty string when there is no parent,
+ *	no driver or no driver name, and is not touched at all when it is
+ *	NULL or @len is not positive.  Always returns @buffer.
+ */
+char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
+{
+	const struct device *parent;
+	const struct device_driver *driver;
+
+	if (!buffer || len <= 0)
+		return buffer;
+
+	buffer[0] = 0;
+
+	parent = dev->dev.parent;
+	if (parent) {
+		driver = parent->driver;
+		if (driver && driver->name)
+			strlcpy(buffer, driver->name, len);
+	}
+
+	return buffer;
+}
+
+/* Per network namespace teardown: free the two hash tables set up by netdev_init(). */
+static void __net_exit netdev_exit(struct net *net)
+{
+ kfree(net->dev_name_head);
+ kfree(net->dev_index_head);
+}
+
+/* Per-namespace hooks, registered with register_pernet_subsys() in net_dev_init(). */
+static struct pernet_operations __net_initdata netdev_net_ops = {
+ .init = netdev_init,
+ .exit = netdev_exit,
+};
+
+/*
+ * Namespace exit hook: every device in @net that is not marked
+ * NETIF_F_NETNS_LOCAL is moved to init_net, with "dev<ifindex>" as the
+ * fallback name should its current name be taken there. A failed move
+ * is fatal (BUG()).
+ */
+static void __net_exit default_device_exit(struct net *net)
+{
+ struct net_device *dev, *next;
+ /*
+ * Push all migratable network devices back to the
+ * initial network namespace
+ */
+ rtnl_lock();
+ for_each_netdev_safe(net, dev, next) {
+ int err;
+ char fb_name[IFNAMSIZ];
+
+ /* Ignore unmoveable devices (i.e. loopback) */
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ continue;
+
+ /* Push remaining network devices to init_net */
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
+ __func__, dev->name, err);
+ BUG();
+ }
+ }
+ rtnl_unlock();
+}
+
+/* Exit-only per-namespace hook, registered with register_pernet_device() in net_dev_init(). */
+static struct pernet_operations __net_initdata default_device_ops = {
+ .exit = default_device_exit,
+};
+
+/*
+ * Initialize the DEV module. At boot time this walks the device list and
+ * unhooks any devices that fail to initialise (normally hardware not
+ * present) and leaves us with a valid list of present and active devices.
+ *
+ */
+
+/*
+ * This is called single threaded during boot, so no need
+ * to take the rtnl semaphore.
+ *
+ * Returns 0 on success. If any of the early registration steps fails
+ * the function returns -ENOMEM right away, without undoing the steps
+ * that already succeeded.
+ */
+static int __init net_dev_init(void)
+{
+ int i, rc = -ENOMEM;
+
+ BUG_ON(!dev_boot_phase);
+
+ if (dev_proc_init())
+ goto out;
+
+ if (netdev_kobject_init())
+ goto out;
+
+ INIT_LIST_HEAD(&ptype_all);
+ for (i = 0; i < PTYPE_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&ptype_base[i]);
+
+ if (register_pernet_subsys(&netdev_net_ops))
+ goto out;
+
+ if (register_pernet_device(&default_device_ops))
+ goto out;
+
+ /*
+ * Initialise the packet receive queues.
+ */
+
+ for_each_possible_cpu(i) {
+ struct softnet_data *queue;
+
+ queue = &per_cpu(softnet_data, i);
+ skb_queue_head_init(&queue->input_pkt_queue);
+ queue->completion_queue = NULL;
+ INIT_LIST_HEAD(&queue->poll_list);
+
+ queue->backlog.poll = process_backlog;
+ queue->backlog.weight = weight_p;
+ }
+
+ /* return value deliberately not checked: a failure here does not stop init */
+ netdev_dma_register();
+
+ dev_boot_phase = 0;
+
+ open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+ open_softirq(NET_RX_SOFTIRQ, net_rx_action);
+
+ hotcpu_notifier(dev_cpu_callback, 0);
+ dst_init();
+ dev_mcast_init();
+ rc = 0;
+out:
+ return rc;
+}
+
+subsys_initcall(net_dev_init);
+
+EXPORT_SYMBOL(__dev_get_by_index);
+EXPORT_SYMBOL(__dev_get_by_name);
+EXPORT_SYMBOL(__dev_remove_pack);
+EXPORT_SYMBOL(dev_valid_name);
+EXPORT_SYMBOL(dev_add_pack);
+EXPORT_SYMBOL(dev_alloc_name);
+EXPORT_SYMBOL(dev_close);
+EXPORT_SYMBOL(dev_get_by_flags);
+EXPORT_SYMBOL(dev_get_by_index);
+EXPORT_SYMBOL(dev_get_by_name);
+EXPORT_SYMBOL(dev_open);
+EXPORT_SYMBOL(dev_queue_xmit);
+EXPORT_SYMBOL(dev_remove_pack);
+EXPORT_SYMBOL(dev_set_allmulti);
+EXPORT_SYMBOL(dev_set_promiscuity);
+EXPORT_SYMBOL(dev_change_flags);
+EXPORT_SYMBOL(dev_set_mtu);
+EXPORT_SYMBOL(dev_set_mac_address);
+EXPORT_SYMBOL(free_netdev);
+EXPORT_SYMBOL(netdev_boot_setup_check);
+EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(netdev_state_change);
+EXPORT_SYMBOL(netif_receive_skb);
+EXPORT_SYMBOL(netif_rx);
+EXPORT_SYMBOL(register_gifconf);
+EXPORT_SYMBOL(register_netdevice);
+EXPORT_SYMBOL(register_netdevice_notifier);
+EXPORT_SYMBOL(skb_checksum_help);
+EXPORT_SYMBOL(synchronize_net);
+EXPORT_SYMBOL(unregister_netdevice);
+EXPORT_SYMBOL(unregister_netdevice_notifier);
+EXPORT_SYMBOL(net_enable_timestamp);
+EXPORT_SYMBOL(net_disable_timestamp);
+EXPORT_SYMBOL(dev_get_flags);
+
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+EXPORT_SYMBOL(br_handle_frame_hook);
+EXPORT_SYMBOL(br_fdb_get_hook);
+EXPORT_SYMBOL(br_fdb_put_hook);
+#endif
+
+EXPORT_SYMBOL(dev_load);
+
+EXPORT_PER_CPU_SYMBOL(softnet_data);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
new file mode 100644
index 0000000..9e2fa39
--- /dev/null
+++ b/net/core/dev_mcast.c
@@ -0,0 +1,229 @@
+/*
+ * Linux NET3: Multicast List maintenance.
+ *
+ * Authors:
+ * Tim Kordas <tjk@nostromo.eeap.cwru.edu>
+ * Richard Underwood <richard@wuzz.demon.co.uk>
+ *
+ * Stir fried together from the IP multicast and CAP patches above
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * Fixes:
+ * Alan Cox : Update the device on a real delete
+ * rather than any time but...
+ * Alan Cox : IFF_ALLMULTI support.
+ * Alan Cox : New format set_multicast_list() calls.
+ * Gleb Natapov : Remove dev_mc_lock.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+
+
+/*
+ * Device multicast list maintenance.
+ *
+ * This is used both by IP and by the user level maintenance functions.
+ * Unlike BSD we maintain a usage count on a given multicast address so
+ * that a casual user application can add/delete multicasts used by
+ * protocols without doing damage to the protocols when it deletes the
+ * entries. It also helps IP as it tracks overlapping maps.
+ *
+ * Device mc lists are changed by bh at least if IPv6 is enabled,
+ * so that it must be bh protected.
+ *
+ * We block accesses to device mc filters with netif_tx_lock.
+ */
+
+/*
+ *	Delete a device level multicast address.  On success the device's
+ *	receive mode is reprogrammed under the same address lock.
+ */
+
+int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
+{
+	int ret;
+
+	netif_addr_lock_bh(dev);
+	ret = __dev_addr_delete(&dev->mc_list, &dev->mc_count,
+				addr, alen, glbl);
+	/* The list changed, so the filter loaded in the card is now wrong. */
+	if (ret == 0)
+		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+
+	return ret;
+}
+
+/*
+ *	Add a device level multicast address.  On success the device's
+ *	receive mode is reprogrammed under the same address lock.
+ */
+
+int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
+{
+	int ret;
+
+	netif_addr_lock_bh(dev);
+	ret = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl);
+	if (ret == 0)
+		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+
+	return ret;
+}
+
+/**
+ *	dev_mc_sync - Synchronize device's multicast list to another device
+ *	@to: destination device
+ *	@from: source device
+ *
+ *	Add newly added addresses to the destination device and release
+ *	addresses that have no users left. The source device must be
+ *	locked by netif_tx_lock_bh.
+ *
+ *	The destination's address lock is taken here, and its receive mode
+ *	is updated when the sync succeeds.  Returns the result of the sync.
+ *
+ *	This function is intended to be called from the dev->set_multicast_list
+ *	or dev->set_rx_mode function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+	int ret;
+
+	netif_addr_lock_bh(to);
+	ret = __dev_addr_sync(&to->mc_list, &to->mc_count,
+			      &from->mc_list, &from->mc_count);
+	if (ret == 0)
+		__dev_set_rx_mode(to);
+	netif_addr_unlock_bh(to);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+
+/**
+ * dev_mc_unsync - Remove synchronized addresses from the destination
+ * device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_mc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ *
+ * Both address locks are held for the operation: @from's is taken
+ * first (disabling bottom halves), then @to's is nested inside it.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+ netif_addr_lock_bh(from);
+ netif_addr_lock(to);
+
+ __dev_addr_unsync(&to->mc_list, &to->mc_count,
+ &from->mc_list, &from->mc_count);
+ __dev_set_rx_mode(to);
+
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file ->show for the "dev_mcast" proc file: prints one line per
+ * multicast list entry of the device (ifindex, name, user count, global
+ * user count, address in hex).  The start token produces no output.
+ */
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+	struct net_device *dev = v;
+	struct dev_addr_list *entry;
+	int byte;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	netif_addr_lock_bh(dev);
+	entry = dev->mc_list;
+	while (entry) {
+		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
+			   dev->name, entry->dmi_users, entry->dmi_gusers);
+
+		for (byte = 0; byte < entry->dmi_addrlen; byte++)
+			seq_printf(seq, "%02x", entry->dmi_addr[byte]);
+
+		seq_putc(seq, '\n');
+		entry = entry->next;
+	}
+	netif_addr_unlock_bh(dev);
+
+	return 0;
+}
+
+/* Iteration uses the dev_seq_* helpers; only ->show is specific to this file. */
+static const struct seq_operations dev_mc_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_mc_seq_show,
+};
+
+/* ->open for the proc file: a namespace-aware seq_file open using dev_mc_seq_ops. */
+static int dev_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_mc_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+/* File operations of the "dev_mcast" proc entry created in dev_mc_net_init(). */
+static const struct file_operations dev_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+/*
+ * Per network namespace init: create the "dev_mcast" proc entry.
+ * Returns 0, or -ENOMEM when the entry could not be created.
+ */
+static int __net_init dev_mc_net_init(struct net *net)
+{
+	return proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops) ?
+		0 : -ENOMEM;
+}
+
+/* Per network namespace exit: remove the "dev_mcast" proc entry again. */
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+ proc_net_remove(net, "dev_mcast");
+}
+
+/* Per-namespace hooks, registered by dev_mcast_init(). */
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+ .init = dev_mc_net_init,
+ .exit = dev_mc_net_exit,
+};
+
+/*
+ * Boot-time setup of the multicast proc interface (called from
+ * net_dev_init()). NOTE(review): the result of
+ * register_pernet_subsys() is not checked here.
+ */
+void __init dev_mcast_init(void)
+{
+ register_pernet_subsys(&dev_mc_net_ops);
+}
+
+EXPORT_SYMBOL(dev_mc_add);
+EXPORT_SYMBOL(dev_mc_delete);
diff --git a/net/core/dst.c b/net/core/dst.c
new file mode 100644
index 0000000..09c1530
--- /dev/null
+++ b/net/core/dst.c
@@ -0,0 +1,347 @@
+/*
+ * net/core/dst.c Protocol independent destination cache.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+
+#include <net/dst.h>
+
+/*
+ * Theory of operations:
+ * 1) We use a list, protected by a spinlock, to add
+ * new entries from both BH and non-BH context.
+ * 2) In order to keep the spinlock held for only a short time,
+ * we use a second list in which long-lived entries are stored;
+ * it is handled by the garbage collect thread
+ * fired by a workqueue.
+ * 3) This second list is guarded by a mutex,
+ * so that the gc_task and dst_dev_event() can be synchronized.
+ */
+#if RT_CACHE_DEBUG >= 2
+static atomic_t dst_total = ATOMIC_INIT(0);
+#endif
+
+/*
+ * We want to keep lock & list close together
+ * to dirty as few cache lines as possible in __dst_free().
+ * As this is not a very strong hint, we don't force an alignment on SMP.
+ */
+static struct {
+ spinlock_t lock; /* taken with _bh around accesses to the fields below */
+ struct dst_entry *list; /* entries queued by __dst_free(), drained by the gc */
+ unsigned long timer_inc; /* step by which timer_expires grows between gc runs */
+ unsigned long timer_expires; /* delay used when (re)scheduling dst_gc_work */
+} dst_garbage = {
+ .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
+ .timer_inc = DST_GC_MAX,
+};
+static void dst_gc_task(struct work_struct *work);
+static void ___dst_free(struct dst_entry * dst);
+
+static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
+
+static DEFINE_MUTEX(dst_gc_mutex);
+/*
+ * long lived entries are maintained in this list, guarded by dst_gc_mutex
+ */
+static struct dst_entry *dst_busy_list;
+
+/*
+ * Delayed-work handler that garbage collects dst entries.
+ *
+ * Under dst_gc_mutex it walks dst_busy_list and then, for as long as
+ * new ones show up, the entries queued on dst_garbage.list. Entries
+ * whose refcount is still non-zero are kept (chained behind the local
+ * 'head'); the others are passed to dst_destroy(). The kept chain
+ * becomes the new dst_busy_list. If it is not empty the work re-arms
+ * itself, with a delay that grows while little is being freed and is
+ * reset to DST_GC_MIN otherwise.
+ */
+static void dst_gc_task(struct work_struct *work)
+{
+ int delayed = 0;
+ int work_performed = 0;
+ unsigned long expires = ~0L;
+ struct dst_entry *dst, *next, head;
+ struct dst_entry *last = &head;
+#if RT_CACHE_DEBUG >= 2
+ ktime_t time_start = ktime_get();
+ struct timespec elapsed;
+#endif
+
+ mutex_lock(&dst_gc_mutex);
+ next = dst_busy_list;
+
+loop:
+ while ((dst = next) != NULL) {
+ next = dst->next;
+ prefetch(&next->next);
+ if (likely(atomic_read(&dst->__refcnt))) {
+ /* still referenced: keep it on the chain being rebuilt */
+ last->next = dst;
+ last = dst;
+ delayed++;
+ continue;
+ }
+ work_performed++;
+
+ dst = dst_destroy(dst);
+ if (dst) {
+ /* NOHASH and still referenced. Unless it is already
+ * on gc list, invalidate it and add to gc list.
+ *
+ * Note: this is temporary. Actually, NOHASH dst's
+ * must be obsoleted when parent is obsoleted.
+ * But we do not have state "obsoleted, but
+ * referenced by parent", so it is right.
+ */
+ if (dst->obsolete > 1)
+ continue;
+
+ ___dst_free(dst);
+ /* put it at the front of what is still to be walked */
+ dst->next = next;
+ next = dst;
+ }
+ }
+
+ spin_lock_bh(&dst_garbage.lock);
+ next = dst_garbage.list;
+ if (next) {
+ /* garbage was queued meanwhile: detach it and walk it as well */
+ dst_garbage.list = NULL;
+ spin_unlock_bh(&dst_garbage.lock);
+ goto loop;
+ }
+ /* terminate the kept chain and make it the new busy list */
+ last->next = NULL;
+ dst_busy_list = head.next;
+ if (!dst_busy_list)
+ dst_garbage.timer_inc = DST_GC_MAX;
+ else {
+ /*
+ * if we freed less than 1/10 of delayed entries,
+ * we can sleep longer.
+ */
+ if (work_performed <= delayed/10) {
+ dst_garbage.timer_expires += dst_garbage.timer_inc;
+ if (dst_garbage.timer_expires > DST_GC_MAX)
+ dst_garbage.timer_expires = DST_GC_MAX;
+ dst_garbage.timer_inc += DST_GC_INC;
+ } else {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ }
+ expires = dst_garbage.timer_expires;
+ /*
+ * if the next desired timer is more than 4 seconds in the future
+ * then round the timer to whole seconds
+ */
+ if (expires > 4*HZ)
+ expires = round_jiffies_relative(expires);
+ schedule_delayed_work(&dst_gc_work, expires);
+ }
+
+ spin_unlock_bh(&dst_garbage.lock);
+ mutex_unlock(&dst_gc_mutex);
+#if RT_CACHE_DEBUG >= 2
+ elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start));
+ printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d"
+ " expires: %lu elapsed: %lu us\n",
+ atomic_read(&dst_total), delayed, work_performed,
+ expires,
+ elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC);
+#endif
+}
+
+/*
+ * Input/output handler that simply frees the packet; installed by
+ * dst_alloc() as the initial handler of a new entry. Always returns 0.
+ */
+int dst_discard(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dst_discard);
+
+/*
+ * Allocate a zeroed dst entry from the cache of @ops.  If @ops has a gc
+ * hook and its entry count is above gc_thresh, the hook runs first and a
+ * non-zero result from it aborts the allocation.  The new entry starts
+ * with a zero refcount, is its own path and discards all packets.
+ * Returns the entry, or NULL on failure.
+ */
+void * dst_alloc(struct dst_ops * ops)
+{
+	struct dst_entry *entry;
+
+	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh &&
+	    ops->gc(ops))
+		return NULL;
+
+	entry = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
+	if (entry == NULL)
+		return NULL;
+
+	atomic_set(&entry->__refcnt, 0);
+	entry->ops = ops;
+	entry->lastuse = jiffies;
+	entry->path = entry;
+	entry->output = dst_discard;
+	entry->input = dst_discard;
+#if RT_CACHE_DEBUG >= 2
+	atomic_inc(&dst_total);
+#endif
+	atomic_inc(&ops->entries);
+
+	return entry;
+}
+
+/*
+ * Mark @dst obsolete (obsolete = 2).  When it has no device, or the
+ * device is not up, its handlers are switched to dst_discard as well.
+ */
+static void ___dst_free(struct dst_entry * dst)
+{
+	struct net_device *dev = dst->dev;
+
+	/* The no-device case is needed when a protocol module is unloaded. */
+	if (!dev || !(dev->flags & IFF_UP))
+		dst->input = dst->output = dst_discard;
+
+	dst->obsolete = 2;
+}
+
+/*
+ * Queue @dst for garbage collection: under dst_garbage.lock it is marked
+ * obsolete and pushed onto dst_garbage.list. If the gc interval had
+ * grown beyond DST_GC_INC, the timers are reset and the gc work is
+ * rescheduled to run after DST_GC_MIN.
+ */
+void __dst_free(struct dst_entry * dst)
+{
+ spin_lock_bh(&dst_garbage.lock);
+ ___dst_free(dst);
+ dst->next = dst_garbage.list;
+ dst_garbage.list = dst;
+ if (dst_garbage.timer_inc > DST_GC_INC) {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ cancel_delayed_work(&dst_gc_work);
+ schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
+ }
+ spin_unlock_bh(&dst_garbage.lock);
+}
+
+/*
+ * Release everything @dst holds (hh cache, neighbour, device reference),
+ * run the protocol's destroy hook and free the entry. The reference on
+ * its child is then dropped: a DST_NOHASH child whose last reference
+ * that was is destroyed in turn (looping via 'again').
+ *
+ * Returns a DST_NOHASH child that is still referenced by someone else,
+ * so that the caller can deal with it, or NULL otherwise.
+ */
+struct dst_entry *dst_destroy(struct dst_entry * dst)
+{
+ struct dst_entry *child;
+ struct neighbour *neigh;
+ struct hh_cache *hh;
+
+ smp_rmb();
+
+again:
+ /* snapshot the pointers needed after dst itself has been freed */
+ neigh = dst->neighbour;
+ hh = dst->hh;
+ child = dst->child;
+
+ dst->hh = NULL;
+ if (hh && atomic_dec_and_test(&hh->hh_refcnt))
+ kfree(hh);
+
+ if (neigh) {
+ dst->neighbour = NULL;
+ neigh_release(neigh);
+ }
+
+ atomic_dec(&dst->ops->entries);
+
+ if (dst->ops->destroy)
+ dst->ops->destroy(dst);
+ if (dst->dev)
+ dev_put(dst->dev);
+#if RT_CACHE_DEBUG >= 2
+ atomic_dec(&dst_total);
+#endif
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ dst = child;
+ if (dst) {
+ int nohash = dst->flags & DST_NOHASH;
+
+ if (atomic_dec_and_test(&dst->__refcnt)) {
+ /* We were real parent of this dst, so kill child. */
+ if (nohash)
+ goto again;
+ } else {
+ /* Child is still referenced, return it for freeing. */
+ if (nohash)
+ return dst;
+ /* Child is still in his hash table */
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Drop one reference on @dst; a NULL @dst is ignored.  Warns if the
+ * refcount was already below one.  The entry itself is not freed here.
+ */
+void dst_release(struct dst_entry *dst)
+{
+	if (!dst)
+		return;
+
+	WARN_ON(atomic_read(&dst->__refcnt) < 1);
+	smp_mb__before_atomic_dec();
+	atomic_dec(&dst->__refcnt);
+}
+EXPORT_SYMBOL(dst_release);
+
+/* Dirty hack. We did it in 2.2 (in __dst_free),
+ * we have _very_ good reasons not to repeat
+ * this mistake in 2.3, but we have no choice
+ * now. _It_ _is_ _explicit_ _deliberate_
+ * _race_ _condition_.
+ *
+ * Commented and originally written by Alexey.
+ *
+ * After the protocol's ifdown hook has run, an entry that uses @dev is
+ * either switched to discarding packets (device merely going down) or
+ * re-pointed, together with its neighbour, at the namespace's loopback
+ * device (device being unregistered).
+ */
+static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			      int unregister)
+{
+	if (dst->ops->ifdown)
+		dst->ops->ifdown(dst, dev, unregister);
+
+	if (dev != dst->dev)
+		return;
+
+	if (!unregister) {
+		dst->input = dst->output = dst_discard;
+		return;
+	}
+
+	/* move the device reference from @dev over to loopback */
+	dst->dev = dev_net(dst->dev)->loopback_dev;
+	dev_hold(dst->dev);
+	dev_put(dev);
+
+	if (dst->neighbour && dst->neighbour->dev == dev) {
+		dst->neighbour->dev = dst->dev;
+		dev_hold(dst->dev);
+		dev_put(dev);
+	}
+}
+
+/*
+ * Netdevice notifier.  On NETDEV_UNREGISTER and NETDEV_DOWN every entry
+ * on the busy list gets dst_ifdown() applied; the pending garbage list is
+ * then detached, appended to the busy list and processed the same way.
+ * All other events are ignored.  Always returns NOTIFY_DONE.
+ */
+static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct dst_entry *dst, *tail = NULL;
+	int unregister = (event != NETDEV_DOWN);
+
+	if (event != NETDEV_UNREGISTER && event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	mutex_lock(&dst_gc_mutex);
+	for (dst = dst_busy_list; dst; dst = dst->next) {
+		tail = dst;
+		dst_ifdown(dst, dev, unregister);
+	}
+
+	/* take over whatever __dst_free() has queued so far */
+	spin_lock_bh(&dst_garbage.lock);
+	dst = dst_garbage.list;
+	dst_garbage.list = NULL;
+	spin_unlock_bh(&dst_garbage.lock);
+
+	if (tail)
+		tail->next = dst;
+	else
+		dst_busy_list = dst;
+
+	while (dst) {
+		dst_ifdown(dst, dev, unregister);
+		dst = dst->next;
+	}
+	mutex_unlock(&dst_gc_mutex);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block registered by dst_init(). */
+static struct notifier_block dst_dev_notifier = {
+ .notifier_call = dst_dev_event,
+};
+
+/* Boot-time setup: hook dst_dev_event() into the netdevice notifier chain. */
+void __init dst_init(void)
+{
+ register_netdevice_notifier(&dst_dev_notifier);
+}
+
+EXPORT_SYMBOL(__dst_free);
+EXPORT_SYMBOL(dst_alloc);
+EXPORT_SYMBOL(dst_destroy);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
new file mode 100644
index 0000000..14ada53
--- /dev/null
+++ b/net/core/ethtool.c
@@ -0,0 +1,1042 @@
+/*
+ * net/core/ethtool.c - Ethtool ioctl handler
+ * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
+ *
+ * This file is where we call all the ethtool_ops commands to get
+ * the information ethtool needs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <asm/uaccess.h>
+
+/*
+ * Some useful ethtool_ops methods that're device independent.
+ * If we find that all drivers want to do the same thing here,
+ * we can turn these into dev_() function calls.
+ */
+
+/* Generic ->get_link helper: 1 if netif_carrier_ok() holds, else 0. */
+u32 ethtool_op_get_link(struct net_device *dev)
+{
+	if (netif_carrier_ok(dev))
+		return 1;
+
+	return 0;
+}
+
+/* Generic ->get_tx_csum helper: 1 if any NETIF_F_ALL_CSUM bit is set. */
+u32 ethtool_op_get_tx_csum(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_ALL_CSUM) ? 1 : 0;
+}
+
+/*
+ * Generic ->set_tx_csum helper: set NETIF_F_IP_CSUM in dev->features when
+ * @data is non-zero, clear it otherwise. Always returns 0.
+ */
+int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_IP_CSUM;
+	else
+		dev->features |= NETIF_F_IP_CSUM;
+
+	return 0;
+}
+
+/*
+ * Generic ->set_tx_csum helper for NETIF_F_HW_CSUM devices: set the bit
+ * when @data is non-zero, clear it otherwise. Always returns 0.
+ */
+int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_HW_CSUM;
+	else
+		dev->features |= NETIF_F_HW_CSUM;
+
+	return 0;
+}
+
+/*
+ * Generic ->set_tx_csum helper that switches NETIF_F_IP_CSUM and
+ * NETIF_F_IPV6_CSUM together: both set when @data is non-zero, both
+ * cleared otherwise. Always returns 0.
+ */
+int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+	else
+		dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+	return 0;
+}
+
+/* Generic ->get_sg helper: 1 if NETIF_F_SG is set in dev->features. */
+u32 ethtool_op_get_sg(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_SG) ? 1 : 0;
+}
+
+/*
+ * Generic ->set_sg helper: set NETIF_F_SG when @data is non-zero, clear it
+ * otherwise. Always returns 0.
+ */
+int ethtool_op_set_sg(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_SG;
+	else
+		dev->features |= NETIF_F_SG;
+
+	return 0;
+}
+
+/* Generic ->get_tso helper: 1 if NETIF_F_TSO is set in dev->features. */
+u32 ethtool_op_get_tso(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_TSO) ? 1 : 0;
+}
+
+/*
+ * Generic ->set_tso helper: set NETIF_F_TSO when @data is non-zero, clear
+ * it otherwise. Always returns 0.
+ */
+int ethtool_op_set_tso(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_TSO;
+	else
+		dev->features |= NETIF_F_TSO;
+
+	return 0;
+}
+
+/* Generic ->get_ufo helper: 1 if NETIF_F_UFO is set in dev->features. */
+u32 ethtool_op_get_ufo(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_UFO) ? 1 : 0;
+}
+
+/*
+ * Generic ->set_ufo helper: set NETIF_F_UFO when @data is non-zero, clear
+ * it otherwise. Always returns 0.
+ */
+int ethtool_op_set_ufo(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_UFO;
+	else
+		dev->features |= NETIF_F_UFO;
+
+	return 0;
+}
+
+/* The ETH_FLAG_xxx bits listed here have the same values as their
+ * associated NETIF_F_xxx bits in include/linux/netdevice.h, so this mask
+ * can be applied to dev->features directly (see ethtool_op_get_flags()).
+ */
+static const u32 flags_dup_features =
+ ETH_FLAG_LRO;
+
+/*
+ * Generic ->get_flags helper: return the bits of dev->features selected
+ * by flags_dup_features. Flags that cannot be derived by plain masking
+ * would need extra handling here.
+ */
+u32 ethtool_op_get_flags(struct net_device *dev)
+{
+	u32 flags = dev->features & flags_dup_features;
+
+	return flags;
+}
+
+/*
+ * Generic ->set_flags helper: mirror ETH_FLAG_LRO from @data into
+ * NETIF_F_LRO in dev->features. Other bits of @data are not examined.
+ * Always returns 0.
+ */
+int ethtool_op_set_flags(struct net_device *dev, u32 data)
+{
+	if (!(data & ETH_FLAG_LRO))
+		dev->features &= ~NETIF_F_LRO;
+	else
+		dev->features |= NETIF_F_LRO;
+
+	return 0;
+}
+
+/* Handlers for each ethtool command */
+
+/*
+ * ETHTOOL_GSET: have the driver fill a struct ethtool_cmd and copy it to
+ * @useraddr. Returns -EOPNOTSUPP without a ->get_settings hook, the
+ * driver's negative error, -EFAULT on a failed copy, or 0.
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_cmd cmd = { ETHTOOL_GSET };
+	int rc;
+
+	if (!dev->ethtool_ops->get_settings)
+		return -EOPNOTSUPP;
+
+	rc = dev->ethtool_ops->get_settings(dev, &cmd);
+	if (rc < 0)
+		return rc;
+
+	return copy_to_user(useraddr, &cmd, sizeof(cmd)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SSET: read a struct ethtool_cmd from @useraddr and hand it to
+ * the driver's ->set_settings hook, whose result is returned. Returns
+ * -EOPNOTSUPP without the hook and -EFAULT on a failed copy.
+ */
+static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_cmd cmd;
+
+	if (!dev->ethtool_ops->set_settings)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&cmd, useraddr, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_settings(dev, &cmd);
+}
+
+/*
+ * ETHTOOL_GDRVINFO: start from a zeroed struct ethtool_drvinfo, let the
+ * driver fill it, then add the test/stats/priv-flags counts and the
+ * register and EEPROM dump lengths from whichever hooks the driver
+ * provides, and copy the result to @useraddr.
+ */
+static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_drvinfo info;
+
+	if (!ops->get_drvinfo)
+		return -EOPNOTSUPP;
+
+	memset(&info, 0, sizeof(info));
+	info.cmd = ETHTOOL_GDRVINFO;
+	ops->get_drvinfo(dev, &info);
+
+	if (!ops->get_sset_count) {
+		/* obsolete per-set count hooks */
+		if (ops->self_test_count)
+			info.testinfo_len = ops->self_test_count(dev);
+		if (ops->get_stats_count)
+			info.n_stats = ops->get_stats_count(dev);
+	} else {
+		int count;
+
+		/* negative counts are errors and leave the field at 0 */
+		count = ops->get_sset_count(dev, ETH_SS_TEST);
+		if (count >= 0)
+			info.testinfo_len = count;
+
+		count = ops->get_sset_count(dev, ETH_SS_STATS);
+		if (count >= 0)
+			info.n_stats = count;
+
+		count = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+		if (count >= 0)
+			info.n_priv_flags = count;
+	}
+
+	if (ops->get_regs_len)
+		info.regdump_len = ops->get_regs_len(dev);
+	if (ops->get_eeprom_len)
+		info.eedump_len = ops->get_eeprom_len(dev);
+
+	return copy_to_user(useraddr, &info, sizeof(info)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SRXFH: read a struct ethtool_rxnfc from @useraddr and pass it to
+ * the driver's ->set_rxhash hook, whose result is returned. Returns
+ * -EOPNOTSUPP without the hook and -EFAULT on a failed copy.
+ */
+static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_rxnfc nfc;
+
+	if (!dev->ethtool_ops->set_rxhash)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&nfc, useraddr, sizeof(nfc)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_rxhash(dev, &nfc);
+}
+
+/*
+ * ETHTOOL_GRXFH: read the caller's struct ethtool_rxnfc, let the driver's
+ * ->get_rxhash hook update it, and copy it back to @useraddr. The hook's
+ * return value is not used.
+ */
+static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_rxnfc nfc;
+
+	if (!dev->ethtool_ops->get_rxhash)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&nfc, useraddr, sizeof(nfc)) != 0)
+		return -EFAULT;
+
+	dev->ethtool_ops->get_rxhash(dev, &nfc);
+
+	return copy_to_user(useraddr, &nfc, sizeof(nfc)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_GREGS: dump the device registers to user space.
+ *
+ * Reads the caller's struct ethtool_regs, clamps regs.len to the length
+ * reported by ->get_regs_len, lets ->get_regs fill a kernel buffer and
+ * then copies the header followed by regs.len bytes of dump data back to
+ * @useraddr.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if either hook is missing, -EFAULT on
+ * a failed user copy, or -ENOMEM if the buffer cannot be allocated.
+ */
+static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_regs regs;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void *regbuf;
+	int reglen, ret;
+
+	if (!ops->get_regs || !ops->get_regs_len)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&regs, useraddr, sizeof(regs)))
+		return -EFAULT;
+
+	reglen = ops->get_regs_len(dev);
+	if (regs.len > reglen)
+		regs.len = reglen;
+
+	/*
+	 * The buffer must be zeroed: regs.len bytes of it are copied to user
+	 * space below whether or not the driver wrote all of them, so a
+	 * plain kmalloc() would leak stale kernel heap contents.
+	 */
+	regbuf = kzalloc(reglen, GFP_USER);
+	if (!regbuf)
+		return -ENOMEM;
+
+	ops->get_regs(dev, &regs, regbuf);
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &regs, sizeof(regs)))
+		goto out;
+	/* dump data follows the header at the offset of the data[] member */
+	useraddr += offsetof(struct ethtool_regs, data);
+	if (copy_to_user(useraddr, regbuf, regs.len))
+		goto out;
+	ret = 0;
+
+ out:
+	kfree(regbuf);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GWOL: let the driver's ->get_wol hook fill a struct
+ * ethtool_wolinfo and copy it to @useraddr.
+ */
+static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_wolinfo wolinfo = { ETHTOOL_GWOL };
+
+	if (!dev->ethtool_ops->get_wol)
+		return -EOPNOTSUPP;
+
+	dev->ethtool_ops->get_wol(dev, &wolinfo);
+
+	return copy_to_user(useraddr, &wolinfo, sizeof(wolinfo)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SWOL: read a struct ethtool_wolinfo from @useraddr and pass it
+ * to the driver's ->set_wol hook, whose result is returned.
+ */
+static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_wolinfo wolinfo;
+
+	if (!dev->ethtool_ops->set_wol)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&wolinfo, useraddr, sizeof(wolinfo)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_wol(dev, &wolinfo);
+}
+
+/*
+ * ETHTOOL_NWAY_RST: forward to the driver's ->nway_reset hook, or return
+ * -EOPNOTSUPP if the driver has none.
+ */
+static int ethtool_nway_reset(struct net_device *dev)
+{
+	if (dev->ethtool_ops->nway_reset)
+		return dev->ethtool_ops->nway_reset(dev);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * ETHTOOL_GEEPROM: read a range of the device EEPROM into user space.
+ *
+ * The requested [offset, offset + len) range is validated against wrap,
+ * zero length and the EEPROM size, then read through ->get_eeprom in
+ * chunks of at most PAGE_SIZE. Each chunk is copied to the user buffer
+ * that follows the struct ethtool_eeprom header. Finally the header is
+ * written back with the number of bytes actually transferred and the
+ * original offset.
+ */
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void __user *userbuf = useraddr + sizeof(struct ethtool_eeprom);
+	struct ethtool_eeprom eeprom;
+	u32 bytes_remaining;
+	u32 end;
+	u8 *chunk;
+	int ret = 0;
+
+	if (!ops->get_eeprom || !ops->get_eeprom_len)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+		return -EFAULT;
+
+	end = eeprom.offset + eeprom.len;
+
+	/* reject a wrapped or empty range */
+	if (end <= eeprom.offset)
+		return -EINVAL;
+
+	/* reject a range that runs past the end of the EEPROM */
+	if (end > ops->get_eeprom_len(dev))
+		return -EINVAL;
+
+	chunk = kmalloc(PAGE_SIZE, GFP_USER);
+	if (!chunk)
+		return -ENOMEM;
+
+	bytes_remaining = eeprom.len;
+	while (bytes_remaining > 0) {
+		eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+		ret = ops->get_eeprom(dev, &eeprom, chunk);
+		if (ret)
+			break;
+
+		if (copy_to_user(userbuf, chunk, eeprom.len)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		userbuf += eeprom.len;
+		eeprom.offset += eeprom.len;
+		bytes_remaining -= eeprom.len;
+	}
+
+	/* report how much was transferred, starting from the original offset */
+	eeprom.len = userbuf - (useraddr + sizeof(eeprom));
+	eeprom.offset -= eeprom.len;
+	if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
+		ret = -EFAULT;
+
+	kfree(chunk);
+	return ret;
+}
+
+/*
+ * ETHTOOL_SEEPROM: write a range of the device EEPROM from user space.
+ *
+ * The requested [offset, offset + len) range is validated against wrap,
+ * zero length and the EEPROM size. The payload that follows the struct
+ * ethtool_eeprom header is then fetched and passed to ->set_eeprom in
+ * chunks of at most PAGE_SIZE. Stops at the first failure and returns
+ * its error code.
+ */
+static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void __user *userbuf = useraddr + sizeof(struct ethtool_eeprom);
+	struct ethtool_eeprom eeprom;
+	u32 bytes_remaining;
+	u32 end;
+	u8 *chunk;
+	int ret = 0;
+
+	if (!ops->set_eeprom || !ops->get_eeprom_len)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+		return -EFAULT;
+
+	end = eeprom.offset + eeprom.len;
+
+	/* reject a wrapped or empty range */
+	if (end <= eeprom.offset)
+		return -EINVAL;
+
+	/* reject a range that runs past the end of the EEPROM */
+	if (end > ops->get_eeprom_len(dev))
+		return -EINVAL;
+
+	chunk = kmalloc(PAGE_SIZE, GFP_USER);
+	if (!chunk)
+		return -ENOMEM;
+
+	bytes_remaining = eeprom.len;
+	while (bytes_remaining > 0) {
+		eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+		if (copy_from_user(chunk, userbuf, eeprom.len)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = ops->set_eeprom(dev, &eeprom, chunk);
+		if (ret)
+			break;
+
+		userbuf += eeprom.len;
+		eeprom.offset += eeprom.len;
+		bytes_remaining -= eeprom.len;
+	}
+
+	kfree(chunk);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GCOALESCE: let the driver's ->get_coalesce hook fill a struct
+ * ethtool_coalesce and copy it to @useraddr.
+ */
+static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_coalesce coal = { ETHTOOL_GCOALESCE };
+
+	if (!dev->ethtool_ops->get_coalesce)
+		return -EOPNOTSUPP;
+
+	dev->ethtool_ops->get_coalesce(dev, &coal);
+
+	return copy_to_user(useraddr, &coal, sizeof(coal)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SCOALESCE: read a struct ethtool_coalesce from @useraddr and
+ * pass it to the driver's ->set_coalesce hook, whose result is returned.
+ */
+static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_coalesce coal;
+
+	if (!dev->ethtool_ops->set_coalesce)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&coal, useraddr, sizeof(coal)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_coalesce(dev, &coal);
+}
+
+/*
+ * ETHTOOL_GRINGPARAM: let the driver's ->get_ringparam hook fill a struct
+ * ethtool_ringparam and copy it to @useraddr.
+ */
+static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_ringparam ring = { ETHTOOL_GRINGPARAM };
+
+	if (!dev->ethtool_ops->get_ringparam)
+		return -EOPNOTSUPP;
+
+	dev->ethtool_ops->get_ringparam(dev, &ring);
+
+	return copy_to_user(useraddr, &ring, sizeof(ring)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SRINGPARAM: read a struct ethtool_ringparam from @useraddr and
+ * pass it to the driver's ->set_ringparam hook, whose result is returned.
+ */
+static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_ringparam ring;
+
+	if (!dev->ethtool_ops->set_ringparam)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&ring, useraddr, sizeof(ring)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_ringparam(dev, &ring);
+}
+
+/*
+ * ETHTOOL_GPAUSEPARAM: let the driver's ->get_pauseparam hook fill a
+ * struct ethtool_pauseparam and copy it to @useraddr.
+ */
+static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_pauseparam pause = { ETHTOOL_GPAUSEPARAM };
+
+	if (!dev->ethtool_ops->get_pauseparam)
+		return -EOPNOTSUPP;
+
+	dev->ethtool_ops->get_pauseparam(dev, &pause);
+
+	return copy_to_user(useraddr, &pause, sizeof(pause)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SPAUSEPARAM: read a struct ethtool_pauseparam from @useraddr and
+ * pass it to the driver's ->set_pauseparam hook, whose result is returned.
+ */
+static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_pauseparam pause;
+
+	if (!dev->ethtool_ops->set_pauseparam)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&pause, useraddr, sizeof(pause)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_pauseparam(dev, &pause);
+}
+
+/*
+ * Apply a scatter-gather setting through ->set_sg. When SG is being
+ * switched off, TSO and UFO are switched off first through their hooks
+ * (where present); a failure there is returned without touching SG.
+ * The caller must have checked that ->set_sg exists.
+ */
+static int __ethtool_set_sg(struct net_device *dev, u32 data)
+{
+	int err;
+
+	if (!data) {
+		if (dev->ethtool_ops->set_tso) {
+			err = dev->ethtool_ops->set_tso(dev, 0);
+			if (err)
+				return err;
+		}
+
+		if (dev->ethtool_ops->set_ufo) {
+			err = dev->ethtool_ops->set_ufo(dev, 0);
+			if (err)
+				return err;
+		}
+	}
+
+	return dev->ethtool_ops->set_sg(dev, data);
+}
+
+/*
+ * ETHTOOL_STXCSUM: change TX checksum offload through ->set_tx_csum.
+ * When the offload is being disabled and the driver has a ->set_sg hook,
+ * scatter-gather is disabled first via __ethtool_set_sg().
+ */
+static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (!dev->ethtool_ops->set_tx_csum)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	if (edata.data == 0 && dev->ethtool_ops->set_sg) {
+		int err = __ethtool_set_sg(dev, 0);
+
+		if (err)
+			return err;
+	}
+
+	return dev->ethtool_ops->set_tx_csum(dev, edata.data);
+}
+
+/*
+ * ETHTOOL_SSG: change scatter-gather via __ethtool_set_sg(). Enabling it
+ * is refused with -EINVAL unless a NETIF_F_ALL_CSUM feature is already
+ * set on the device.
+ */
+static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (!dev->ethtool_ops->set_sg)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	if (edata.data) {
+		if (!(dev->features & NETIF_F_ALL_CSUM))
+			return -EINVAL;
+	}
+
+	return __ethtool_set_sg(dev, edata.data);
+}
+
+/*
+ * ETHTOOL_STSO: change TSO through ->set_tso. Enabling it is refused with
+ * -EINVAL unless NETIF_F_SG is already set on the device.
+ */
+static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (!dev->ethtool_ops->set_tso)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	if (edata.data) {
+		if (!(dev->features & NETIF_F_SG))
+			return -EINVAL;
+	}
+
+	return dev->ethtool_ops->set_tso(dev, edata.data);
+}
+
+/*
+ * ETHTOOL_SUFO: change UFO through ->set_ufo. Enabling it is refused with
+ * -EINVAL unless both NETIF_F_SG and NETIF_F_HW_CSUM are already set on
+ * the device.
+ */
+static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (!dev->ethtool_ops->set_ufo)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	if (edata.data) {
+		if (!(dev->features & NETIF_F_SG))
+			return -EINVAL;
+		if (!(dev->features & NETIF_F_HW_CSUM))
+			return -EINVAL;
+	}
+
+	return dev->ethtool_ops->set_ufo(dev, edata.data);
+}
+
+/*
+ * ETHTOOL_GGSO: report GSO state to @useraddr. The value is the
+ * NETIF_F_GSO bit masked out of dev->features, i.e. zero or that bit's
+ * value rather than a normalised 0/1.
+ */
+static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata = { ETHTOOL_GGSO };
+
+	edata.data = dev->features & NETIF_F_GSO;
+
+	return copy_to_user(useraddr, &edata, sizeof(edata)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SGSO: set NETIF_F_GSO in dev->features when the user-supplied
+ * value is non-zero, clear it otherwise. No driver hook is involved.
+ */
+static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	if (!edata.data)
+		dev->features &= ~NETIF_F_GSO;
+	else
+		dev->features |= NETIF_F_GSO;
+
+	return 0;
+}
+
+/*
+ * ETHTOOL_TEST: run the driver's self test and return the results.
+ *
+ * The number of u64 result slots comes from ->get_sset_count(ETH_SS_TEST)
+ * or, failing that, the obsolete ->self_test_count hook. The struct
+ * ethtool_test header is copied back first, followed by test.len result
+ * words.
+ */
+static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_test test;
+	u64 *results;
+	int ret, test_len;
+
+	if (!ops->self_test)
+		return -EOPNOTSUPP;
+	if (!ops->get_sset_count && !ops->self_test_count)
+		return -EOPNOTSUPP;
+
+	if (!ops->get_sset_count)
+		/* obsolete hook */
+		test_len = ops->self_test_count(dev);
+	else
+		test_len = ops->get_sset_count(dev, ETH_SS_TEST);
+	if (test_len < 0)
+		return test_len;
+	WARN_ON(test_len == 0);
+
+	if (copy_from_user(&test, useraddr, sizeof(test)))
+		return -EFAULT;
+
+	test.len = test_len;
+	results = kmalloc(test_len * sizeof(u64), GFP_USER);
+	if (!results)
+		return -ENOMEM;
+
+	ops->self_test(dev, &test, results);
+
+	if (copy_to_user(useraddr, &test, sizeof(test))) {
+		ret = -EFAULT;
+	} else {
+		useraddr += sizeof(test);
+		if (copy_to_user(useraddr, results, test.len * sizeof(u64)))
+			ret = -EFAULT;
+		else
+			ret = 0;
+	}
+
+	kfree(results);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GSTRINGS: return the names belonging to one string set.
+ *
+ * The set size comes from ->get_sset_count or, on drivers without it,
+ * from the obsolete per-set count hooks (ETH_SS_TEST and ETH_SS_STATS
+ * only; any other set yields -EINVAL). The struct ethtool_gstrings header
+ * is copied back first, followed by len * ETH_GSTRING_LEN bytes.
+ */
+static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_gstrings gstrings;
+	u8 *names;
+	int ret;
+
+	if (!ops->get_strings)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+		return -EFAULT;
+
+	if (!ops->get_sset_count) {
+		/* obsolete hooks */
+		if (gstrings.string_set == ETH_SS_TEST) {
+			if (!ops->self_test_count)
+				return -EOPNOTSUPP;
+			gstrings.len = ops->self_test_count(dev);
+		} else if (gstrings.string_set == ETH_SS_STATS) {
+			if (!ops->get_stats_count)
+				return -EOPNOTSUPP;
+			gstrings.len = ops->get_stats_count(dev);
+		} else {
+			return -EINVAL;
+		}
+	} else {
+		ret = ops->get_sset_count(dev, gstrings.string_set);
+		if (ret < 0)
+			return ret;
+
+		gstrings.len = ret;
+	}
+
+	names = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+	if (!names)
+		return -ENOMEM;
+
+	ops->get_strings(dev, gstrings.string_set, names);
+
+	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) {
+		ret = -EFAULT;
+	} else {
+		useraddr += sizeof(gstrings);
+		if (copy_to_user(useraddr, names,
+				 gstrings.len * ETH_GSTRING_LEN))
+			ret = -EFAULT;
+		else
+			ret = 0;
+	}
+
+	kfree(names);
+	return ret;
+}
+
+/*
+ * ETHTOOL_PHYS_ID: read a struct ethtool_value from @useraddr and pass its
+ * data word to the driver's ->phys_id hook, whose result is returned.
+ */
+static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (!dev->ethtool_ops->phys_id)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	return dev->ethtool_ops->phys_id(dev, edata.data);
+}
+
+/*
+ * ETHTOOL_GSTATS: return the driver's statistics counters.
+ *
+ * The number of u64 counters comes from ->get_sset_count(ETH_SS_STATS)
+ * or, failing that, the obsolete ->get_stats_count hook. The struct
+ * ethtool_stats header is copied back first, followed by stats.n_stats
+ * counter values.
+ */
+static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_stats stats;
+	u64 *counters;
+	int ret, n_stats;
+
+	if (!ops->get_ethtool_stats)
+		return -EOPNOTSUPP;
+	if (!ops->get_sset_count && !ops->get_stats_count)
+		return -EOPNOTSUPP;
+
+	if (!ops->get_sset_count)
+		/* obsolete hook */
+		n_stats = ops->get_stats_count(dev);
+	else
+		n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
+	if (n_stats < 0)
+		return n_stats;
+	WARN_ON(n_stats == 0);
+
+	if (copy_from_user(&stats, useraddr, sizeof(stats)))
+		return -EFAULT;
+
+	stats.n_stats = n_stats;
+	counters = kmalloc(n_stats * sizeof(u64), GFP_USER);
+	if (!counters)
+		return -ENOMEM;
+
+	ops->get_ethtool_stats(dev, &stats, counters);
+
+	if (copy_to_user(useraddr, &stats, sizeof(stats))) {
+		ret = -EFAULT;
+	} else {
+		useraddr += sizeof(stats);
+		if (copy_to_user(useraddr, counters,
+				 stats.n_stats * sizeof(u64)))
+			ret = -EFAULT;
+		else
+			ret = 0;
+	}
+
+	kfree(counters);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GPERMADDR: copy dev->perm_addr to user space. The caller's
+ * buffer size must be at least dev->addr_len, otherwise -ETOOSMALL is
+ * returned. On success the header is written back with size set to
+ * dev->addr_len, followed by that many address bytes.
+ */
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_perm_addr epaddr;
+	void __user *addr_out = useraddr + sizeof(epaddr);
+
+	if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
+		return -EFAULT;
+
+	if (epaddr.size < dev->addr_len)
+		return -ETOOSMALL;
+	epaddr.size = dev->addr_len;
+
+	if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+		return -EFAULT;
+
+	return copy_to_user(addr_out, dev->perm_addr, epaddr.size) ?
+		-EFAULT : 0;
+}
+
+/*
+ * Shared "get one u32" handler: call @actor and return its result to
+ * @useraddr in a struct ethtool_value tagged with @cmd. Returns
+ * -EOPNOTSUPP when @actor is NULL and -EFAULT on a failed copy.
+ */
+static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
+			     u32 cmd, u32 (*actor)(struct net_device *))
+{
+	struct ethtool_value edata = { cmd };
+
+	if (actor == NULL)
+		return -EOPNOTSUPP;
+
+	edata.data = actor(dev);
+
+	return copy_to_user(useraddr, &edata, sizeof(edata)) ? -EFAULT : 0;
+}
+
+/*
+ * Shared "set one u32" handler for hooks that return nothing: read a
+ * struct ethtool_value from @useraddr and pass its data word to @actor.
+ * Returns -EOPNOTSUPP when @actor is NULL, -EFAULT on a failed copy,
+ * otherwise 0.
+ */
+static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr,
+				  void (*actor)(struct net_device *, u32))
+{
+	struct ethtool_value edata;
+
+	if (actor == NULL)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	actor(dev, edata.data);
+
+	return 0;
+}
+
+/*
+ * Shared "set one u32" handler: read a struct ethtool_value from
+ * @useraddr and return whatever @actor returns for its data word.
+ * Returns -EOPNOTSUPP when @actor is NULL and -EFAULT on a failed copy.
+ */
+static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
+			     int (*actor)(struct net_device *, u32))
+{
+	struct ethtool_value edata;
+
+	if (actor == NULL)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)) != 0)
+		return -EFAULT;
+
+	return actor(dev, edata.data);
+}
+
+/* The main entry point in this file. Called from net/core/dev.c
+ *
+ * Looks up the device named in @ifr within @net, reads the 32-bit command
+ * word at the start of ifr->ifr_data and dispatches to the matching
+ * handler above.
+ *
+ * Returns -ENODEV if the device is missing or not present, -EOPNOTSUPP if
+ * it has no ethtool_ops or the command is unknown, -EFAULT if the command
+ * word cannot be read, -EPERM if a privileged command is issued without
+ * CAP_NET_ADMIN, a negative value from the driver's ->begin hook, or
+ * otherwise the handler's result.
+ */
+
+int dev_ethtool(struct net *net, struct ifreq *ifr)
+{
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ void __user *useraddr = ifr->ifr_data;
+ u32 ethcmd;
+ int rc;
+ unsigned long old_features;
+
+ if (!dev || !netif_device_present(dev))
+ return -ENODEV;
+
+ if (!dev->ethtool_ops)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd)))
+ return -EFAULT;
+
+ /* Allow some commands to be done by anyone */
+ switch(ethcmd) {
+ case ETHTOOL_GDRVINFO:
+ case ETHTOOL_GMSGLVL:
+ case ETHTOOL_GCOALESCE:
+ case ETHTOOL_GRINGPARAM:
+ case ETHTOOL_GPAUSEPARAM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GSTRINGS:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GPERMADDR:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GPFLAGS:
+ case ETHTOOL_GRXFH:
+ break;
+ default:
+ /* Everything else, including unknown commands, needs CAP_NET_ADMIN. */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ }
+
+ /* Optional driver hook run before the command; a negative result aborts. */
+ if (dev->ethtool_ops->begin)
+ if ((rc = dev->ethtool_ops->begin(dev)) < 0)
+ return rc;
+
+ /* Snapshot the features so a change can be announced afterwards. */
+ old_features = dev->features;
+
+ switch (ethcmd) {
+ case ETHTOOL_GSET:
+ rc = ethtool_get_settings(dev, useraddr);
+ break;
+ case ETHTOOL_SSET:
+ rc = ethtool_set_settings(dev, useraddr);
+ break;
+ case ETHTOOL_GDRVINFO:
+ rc = ethtool_get_drvinfo(dev, useraddr);
+ break;
+ case ETHTOOL_GREGS:
+ rc = ethtool_get_regs(dev, useraddr);
+ break;
+ case ETHTOOL_GWOL:
+ rc = ethtool_get_wol(dev, useraddr);
+ break;
+ case ETHTOOL_SWOL:
+ rc = ethtool_set_wol(dev, useraddr);
+ break;
+ case ETHTOOL_GMSGLVL:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_msglevel);
+ break;
+ case ETHTOOL_SMSGLVL:
+ rc = ethtool_set_value_void(dev, useraddr,
+ dev->ethtool_ops->set_msglevel);
+ break;
+ case ETHTOOL_NWAY_RST:
+ rc = ethtool_nway_reset(dev);
+ break;
+ case ETHTOOL_GLINK:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_link);
+ break;
+ case ETHTOOL_GEEPROM:
+ rc = ethtool_get_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_SEEPROM:
+ rc = ethtool_set_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GCOALESCE:
+ rc = ethtool_get_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_SCOALESCE:
+ rc = ethtool_set_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_GRINGPARAM:
+ rc = ethtool_get_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_SRINGPARAM:
+ rc = ethtool_set_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_GPAUSEPARAM:
+ rc = ethtool_get_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_SPAUSEPARAM:
+ rc = ethtool_set_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_GRXCSUM:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_rx_csum);
+ break;
+ case ETHTOOL_SRXCSUM:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_rx_csum);
+ break;
+ case ETHTOOL_GTXCSUM:
+ /* Fall back to the generic feature-bit getter if the driver has none. */
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ (dev->ethtool_ops->get_tx_csum ?
+ dev->ethtool_ops->get_tx_csum :
+ ethtool_op_get_tx_csum));
+ break;
+ case ETHTOOL_STXCSUM:
+ rc = ethtool_set_tx_csum(dev, useraddr);
+ break;
+ case ETHTOOL_GSG:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ (dev->ethtool_ops->get_sg ?
+ dev->ethtool_ops->get_sg :
+ ethtool_op_get_sg));
+ break;
+ case ETHTOOL_SSG:
+ rc = ethtool_set_sg(dev, useraddr);
+ break;
+ case ETHTOOL_GTSO:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ (dev->ethtool_ops->get_tso ?
+ dev->ethtool_ops->get_tso :
+ ethtool_op_get_tso));
+ break;
+ case ETHTOOL_STSO:
+ rc = ethtool_set_tso(dev, useraddr);
+ break;
+ case ETHTOOL_TEST:
+ rc = ethtool_self_test(dev, useraddr);
+ break;
+ case ETHTOOL_GSTRINGS:
+ rc = ethtool_get_strings(dev, useraddr);
+ break;
+ case ETHTOOL_PHYS_ID:
+ rc = ethtool_phys_id(dev, useraddr);
+ break;
+ case ETHTOOL_GSTATS:
+ rc = ethtool_get_stats(dev, useraddr);
+ break;
+ case ETHTOOL_GPERMADDR:
+ rc = ethtool_get_perm_addr(dev, useraddr);
+ break;
+ case ETHTOOL_GUFO:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ (dev->ethtool_ops->get_ufo ?
+ dev->ethtool_ops->get_ufo :
+ ethtool_op_get_ufo));
+ break;
+ case ETHTOOL_SUFO:
+ rc = ethtool_set_ufo(dev, useraddr);
+ break;
+ case ETHTOOL_GGSO:
+ rc = ethtool_get_gso(dev, useraddr);
+ break;
+ case ETHTOOL_SGSO:
+ rc = ethtool_set_gso(dev, useraddr);
+ break;
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_flags);
+ break;
+ case ETHTOOL_GPFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_priv_flags);
+ break;
+ case ETHTOOL_SPFLAGS:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_priv_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ rc = ethtool_get_rxhash(dev, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ rc = ethtool_set_rxhash(dev, useraddr);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ /* Optional driver hook run after the command, whatever its result. */
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ /* Announce a feature change if the command altered dev->features. */
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+
+ return rc;
+}
+
+EXPORT_SYMBOL(ethtool_op_get_link);
+EXPORT_SYMBOL(ethtool_op_get_sg);
+EXPORT_SYMBOL(ethtool_op_get_tso);
+EXPORT_SYMBOL(ethtool_op_get_tx_csum);
+EXPORT_SYMBOL(ethtool_op_set_sg);
+EXPORT_SYMBOL(ethtool_op_set_tso);
+EXPORT_SYMBOL(ethtool_op_set_tx_csum);
+EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
+EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
+EXPORT_SYMBOL(ethtool_op_set_ufo);
+EXPORT_SYMBOL(ethtool_op_get_ufo);
+EXPORT_SYMBOL(ethtool_op_set_flags);
+EXPORT_SYMBOL(ethtool_op_get_flags);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
new file mode 100644
index 0000000..79de3b1
--- /dev/null
+++ b/net/core/fib_rules.c
@@ -0,0 +1,685 @@
+/*
+ * net/core/fib_rules.c Generic Routing Rules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/fib_rules.h>
+
+/*
+ * Allocate a FR_ACT_TO_TBL rule with the given preference, table and
+ * flags and append it to ops->rules_list. The rule starts with a
+ * reference count of 1 and holds a reference on ops->fro_net.
+ *
+ * Returns 0 on success or -ENOMEM if the rule cannot be allocated.
+ */
+int fib_default_rule_add(struct fib_rules_ops *ops,
+			 u32 pref, u32 table, u32 flags)
+{
+	struct fib_rule *rule = kzalloc(ops->rule_size, GFP_KERNEL);
+
+	if (!rule)
+		return -ENOMEM;
+
+	atomic_set(&rule->refcnt, 1);
+	rule->action = FR_ACT_TO_TBL;
+	rule->pref = pref;
+	rule->table = table;
+	rule->flags = flags;
+	rule->fr_net = hold_net(ops->fro_net);
+
+	/* No locking needed: the list is not yet reachable by anyone else
+	 * at the moment this function is called.
+	 */
+	list_add_tail(&rule->list, &ops->rules_list);
+
+	return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid);
+
+/*
+ * Find the rules ops registered for @family in @net and take a reference
+ * on its owning module. Returns NULL when no ops match or the module
+ * reference cannot be taken. Release with rules_ops_put().
+ */
+static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+{
+	struct fib_rules_ops *ops, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+		if (ops->family != family)
+			continue;
+
+		/* only the first match is considered */
+		if (try_module_get(ops->owner))
+			found = ops;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Drop the module reference taken by lookup_rules_ops(); NULL is a no-op. */
+static void rules_ops_put(struct fib_rules_ops *ops)
+{
+	if (!ops)
+		return;
+
+	module_put(ops->owner);
+}
+
+/* Invoke the ops' ->flush_cache hook when one is provided. */
+static void flush_route_cache(struct fib_rules_ops *ops)
+{
+	if (!ops->flush_cache)
+		return;
+
+	ops->flush_cache(ops);
+}
+
+/*
+ * Register @ops on the rules_ops list of ops->fro_net.
+ *
+ * Returns -EINVAL if ops->rule_size is smaller than struct fib_rule or
+ * any of the match/configure/compare/fill/action hooks is missing,
+ * -EEXIST if ops for the same family are already registered, and 0 on
+ * success (in which case a reference on the namespace is held).
+ */
+int fib_rules_register(struct fib_rules_ops *ops)
+{
+	struct net *net = ops->fro_net;
+	struct fib_rules_ops *cur;
+	int err = 0;
+
+	if (ops->rule_size < sizeof(struct fib_rule))
+		return -EINVAL;
+
+	if (!ops->match || !ops->configure || !ops->compare ||
+	    !ops->fill || !ops->action)
+		return -EINVAL;
+
+	spin_lock(&net->rules_mod_lock);
+
+	list_for_each_entry(cur, &net->rules_ops, list) {
+		if (cur->family == ops->family) {
+			err = -EEXIST;
+			break;
+		}
+	}
+
+	if (!err) {
+		hold_net(net);
+		list_add_tail_rcu(&ops->list, &net->rules_ops);
+	}
+
+	spin_unlock(&net->rules_mod_lock);
+
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_register);
+
+/*
+ * Unlink every rule from ops->rules_list and drop the list's reference
+ * on each of them.
+ */
+void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+{
+	struct fib_rule *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &ops->rules_list, list) {
+		list_del_rcu(&cur->list);
+		fib_rule_put(cur);
+	}
+}
+EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
+
+/*
+ * Remove @ops from its namespace's rules_ops list and release all of its
+ * rules, then wait for an RCU grace period before dropping the namespace
+ * reference.
+ */
+void fib_rules_unregister(struct fib_rules_ops *ops)
+{
+ struct net *net = ops->fro_net;
+
+ /* Unlink the ops and empty its rule list under the modification lock. */
+ spin_lock(&net->rules_mod_lock);
+ list_del_rcu(&ops->list);
+ fib_rules_cleanup_ops(ops);
+ spin_unlock(&net->rules_mod_lock);
+
+ /* Wait for a grace period before releasing the namespace reference. */
+ synchronize_rcu();
+ release_net(net);
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
+
+/*
+ * Decide whether @rule applies to flow @fl. The generic interface and
+ * mark selectors are checked first; only if both pass is the
+ * family-specific ops->match consulted. FIB_RULE_INVERT negates the
+ * final result, including a mismatch on the generic selectors.
+ */
+static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
+			  struct flowi *fl, int flags)
+{
+	int matched;
+
+	if (rule->ifindex && rule->ifindex != fl->iif)
+		matched = 0;
+	else if ((rule->mark ^ fl->mark) & rule->mark_mask)
+		matched = 0;
+	else
+		matched = ops->match(rule, fl, flags);
+
+	if (rule->flags & FIB_RULE_INVERT)
+		return !matched;
+
+	return matched;
+}
+
+/*
+ * Walk ops->rules_list under rcu_read_lock() and run the action of the
+ * first matching rule whose action does not return -EAGAIN.
+ *
+ * On such a hit, a reference is taken on the rule, it is stored in
+ * arg->rule and the action's return value is returned. If the list is
+ * exhausted, -ESRCH is returned.
+ */
+int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct fib_rule *rule;
+ int err;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ /* Re-entry point after following a goto rule to its target. */
+jumped:
+ if (!fib_rule_match(rule, ops, fl, flags))
+ continue;
+
+ if (rule->action == FR_ACT_GOTO) {
+ struct fib_rule *target;
+
+ /* An unresolved goto (NULL target) is skipped; otherwise the
+ * walk continues from the target rule.
+ */
+ target = rcu_dereference(rule->ctarget);
+ if (target == NULL) {
+ continue;
+ } else {
+ rule = target;
+ goto jumped;
+ }
+ } else if (rule->action == FR_ACT_NOP)
+ continue;
+ else
+ err = ops->action(rule, fl, flags, arg);
+
+ /* -EAGAIN from the action means: try the next rule. */
+ if (err != -EAGAIN) {
+ fib_rule_get(rule);
+ arg->rule = rule;
+ goto out;
+ }
+ }
+
+ err = -ESRCH;
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_lookup);
+
+/*
+ * Sanity-check the source/destination selectors of a rule message.
+ * For each non-zero prefix length the matching address attribute must be
+ * present, the prefix must not exceed ops->addr_size * 8 bits and the
+ * attribute payload must be exactly ops->addr_size bytes.
+ *
+ * Returns 0 if the message is acceptable, -EINVAL otherwise.
+ */
+static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
+			    struct fib_rules_ops *ops)
+{
+	if (frh->src_len &&
+	    (tb[FRA_SRC] == NULL ||
+	     frh->src_len > (ops->addr_size * 8) ||
+	     nla_len(tb[FRA_SRC]) != ops->addr_size))
+		return -EINVAL;
+
+	if (frh->dst_len &&
+	    (tb[FRA_DST] == NULL ||
+	     frh->dst_len > (ops->addr_size * 8) ||
+	     nla_len(tb[FRA_DST]) != ops->addr_size))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Netlink handler that creates a rule from the message in @nlh.
+ *
+ * Looks up the rules ops for the message's family, parses and validates
+ * the attributes, builds the rule, inserts it into ops->rules_list in
+ * order of preference, sends an RTM_NEWRULE notification and flushes the
+ * route cache.
+ *
+ * Returns 0 on success, -EINVAL for a malformed message or an invalid
+ * goto, -EAFNOSUPPORT if no ops exist for the family, -ENOMEM on
+ * allocation failure, or a negative error from nlmsg_parse() or
+ * ops->configure().
+ */
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL, unresolved = 0;
+
+ /* The message must be large enough to hold the rule header. */
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ goto errout;
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+ if (err < 0)
+ goto errout;
+
+ err = validate_rulemsg(frh, tb, ops);
+ if (err < 0)
+ goto errout;
+
+ rule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (rule == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ rule->fr_net = hold_net(net);
+
+ if (tb[FRA_PRIORITY])
+ rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+
+ if (tb[FRA_IFNAME]) {
+ struct net_device *dev;
+
+ /* ifindex stays -1 if no device of that name exists right now. */
+ rule->ifindex = -1;
+ nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, rule->ifname);
+ if (dev)
+ rule->ifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_FWMARK]) {
+ rule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (rule->mark)
+ /* compatibility: if the mark value is non-zero all bits
+ * are compared unless a mask is explicitly specified.
+ */
+ rule->mark_mask = 0xFFFFFFFF;
+ }
+
+ if (tb[FRA_FWMASK])
+ rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+
+ rule->action = frh->action;
+ rule->flags = frh->flags;
+ rule->table = frh_get_table(frh, tb);
+
+ /* No preference given: ask the ops for a default, if it has one. */
+ if (!rule->pref && ops->default_pref)
+ rule->pref = ops->default_pref(ops);
+
+ err = -EINVAL;
+ if (tb[FRA_GOTO]) {
+ /* A goto target is only valid on a goto rule. */
+ if (rule->action != FR_ACT_GOTO)
+ goto errout_free;
+
+ rule->target = nla_get_u32(tb[FRA_GOTO]);
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (rule->target <= rule->pref)
+ goto errout_free;
+
+ /* Resolve the target to an existing rule with that preference. */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ rule->ctarget = r;
+ break;
+ }
+ }
+
+ if (rule->ctarget == NULL)
+ unresolved = 1;
+ } else if (rule->action == FR_ACT_GOTO)
+ /* A goto rule without a target attribute is invalid. */
+ goto errout_free;
+
+ err = ops->configure(rule, skb, nlh, frh, tb);
+ if (err < 0)
+ goto errout_free;
+
+ /* Find the last existing rule whose preference is not greater. */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref > rule->pref)
+ break;
+ last = r;
+ }
+
+ fib_rule_get(rule);
+
+ if (ops->unresolved_rules) {
+ /*
+ * There are unresolved goto rules in the list, check if
+ * any of them are pointing to this new rule.
+ */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action == FR_ACT_GOTO &&
+ r->target == rule->pref) {
+ BUG_ON(r->ctarget != NULL);
+ rcu_assign_pointer(r->ctarget, rule);
+ if (--ops->unresolved_rules == 0)
+ break;
+ }
+ }
+ }
+
+ if (rule->action == FR_ACT_GOTO)
+ ops->nr_goto_rules++;
+
+ if (unresolved)
+ ops->unresolved_rules++;
+
+ /* Insert after 'last', or at the head if nothing sorts before it. */
+ if (last)
+ list_add_rcu(&rule->list, &last->list);
+ else
+ list_add_rcu(&rule->list, &ops->rules_list);
+
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+
+errout_free:
+ release_net(rule->fr_net);
+ kfree(rule);
+errout:
+ /* rules_ops_put() tolerates ops == NULL from the early exits. */
+ rules_ops_put(ops);
+ return err;
+}
+
+/*
+ * fib_nl_delrule - netlink handler registered for RTM_DELRULE.
+ *
+ * Walks the rule list of the address family named in the request header and
+ * deletes the first rule that matches every selector present in the message.
+ *
+ * Returns 0 on success, -EINVAL if the message is shorter than a rule header,
+ * -EAFNOSUPPORT if no rules ops exist for the family, -EPERM if the matching
+ * rule is flagged FIB_RULE_PERMANENT, -ENOENT if nothing matched, or the
+ * error reported by nlmsg_parse()/validate_rulemsg().
+ */
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule, *tmp;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ goto errout;
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+ if (err < 0)
+ goto errout;
+
+ err = validate_rulemsg(frh, tb, ops);
+ if (err < 0)
+ goto errout;
+
+ list_for_each_entry(rule, &ops->rules_list, list) {
+ /* A zero header field or an absent attribute acts as a wildcard. */
+ if (frh->action && (frh->action != rule->action))
+ continue;
+
+ if (frh->table && (frh_get_table(frh, tb) != rule->table))
+ continue;
+
+ if (tb[FRA_PRIORITY] &&
+ (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
+ continue;
+
+ if (tb[FRA_IFNAME] &&
+ nla_strcmp(tb[FRA_IFNAME], rule->ifname))
+ continue;
+
+ if (tb[FRA_FWMARK] &&
+ (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
+ continue;
+
+ if (tb[FRA_FWMASK] &&
+ (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
+ continue;
+
+ /* Family specific selectors are compared by the ops callback. */
+ if (!ops->compare(rule, frh, tb))
+ continue;
+
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
+ }
+
+ list_del_rcu(&rule->list);
+
+ if (rule->action == FR_ACT_GOTO)
+ ops->nr_goto_rules--;
+
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules have
+ * actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ list_for_each_entry(tmp, &ops->rules_list, list) {
+ if (tmp->ctarget == rule) {
+ rcu_assign_pointer(tmp->ctarget, NULL);
+ ops->unresolved_rules++;
+ }
+ }
+ }
+
+ /* Wait for an RCU grace period before dropping the list's reference. */
+ synchronize_rcu();
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).pid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+ }
+
+ err = -ENOENT;
+errout:
+ /* ops may still be NULL here when the length check or lookup failed. */
+ rules_ops_put(ops);
+ return err;
+}
+
+/*
+ * fib_rule_nlmsg_size - upper bound for the size of a rule notification.
+ * @ops: rules ops of the family; its optional nlmsg_payload() callback adds
+ *       the space needed by family specific attributes
+ * @rule: rule that is going to be serialised
+ *
+ * Every attribute that fib_nl_fill_rule() may emit has to be accounted for
+ * here, otherwise filling the message can fail with -EMSGSIZE.  FRA_GOTO is
+ * emitted for rules with a non-zero target, so it is included as well.
+ *
+ * Returns the number of payload bytes to pass to nlmsg_new().
+ */
+static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+					 struct fib_rule *rule)
+{
+	size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
+			 + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */
+			 + nla_total_size(4) /* FRA_PRIORITY */
+			 + nla_total_size(4) /* FRA_TABLE */
+			 + nla_total_size(4) /* FRA_FWMARK */
+			 + nla_total_size(4) /* FRA_FWMASK */
+			 + nla_total_size(4); /* FRA_GOTO */
+
+	if (ops->nlmsg_payload)
+		payload += ops->nlmsg_payload(rule);
+
+	return payload;
+}
+
+/*
+ * fib_nl_fill_rule - serialise one rule into a netlink message.
+ *
+ * Appends a message of the given type to skb: the generic header fields and
+ * attributes are written here, the family specific part by ops->fill().
+ *
+ * Returns the value of nlmsg_end() on success or -EMSGSIZE if the header or
+ * an attribute did not fit, in which case the partial message is cancelled.
+ */
+static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
+ u32 pid, u32 seq, int type, int flags,
+ struct fib_rules_ops *ops)
+{
+ struct nlmsghdr *nlh;
+ struct fib_rule_hdr *frh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ /* The NLA_PUT_* macros below branch to nla_put_failure when out of room. */
+ frh = nlmsg_data(nlh);
+ frh->table = rule->table;
+ NLA_PUT_U32(skb, FRA_TABLE, rule->table);
+ frh->res1 = 0;
+ frh->res2 = 0;
+ frh->action = rule->action;
+ frh->flags = rule->flags;
+
+ /* A goto rule whose target rule does not exist (yet) is reported as such. */
+ if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL)
+ frh->flags |= FIB_RULE_UNRESOLVED;
+
+ if (rule->ifname[0]) {
+ NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname);
+
+ /* ifindex -1 marks a rule whose named device is currently not present. */
+ if (rule->ifindex == -1)
+ frh->flags |= FIB_RULE_DEV_DETACHED;
+ }
+
+ if (rule->pref)
+ NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
+
+ if (rule->mark)
+ NLA_PUT_U32(skb, FRA_FWMARK, rule->mark);
+
+ if (rule->mark_mask || rule->mark)
+ NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask);
+
+ if (rule->target)
+ NLA_PUT_U32(skb, FRA_GOTO, rule->target);
+
+ if (ops->fill(rule, skb, nlh, frh) < 0)
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/*
+ * dump_rules - append the rules of one family to a dump skb.
+ * @skb: skb being filled for the dump
+ * @cb: dump state; cb->args[1] is the index of the first rule still to dump
+ * @ops: rules ops of the family; the caller's reference is dropped here
+ *
+ * Returns 0 if all remaining rules were written, or the negative error of
+ * fib_nl_fill_rule() if the skb ran out of room.  In both cases cb->args[1]
+ * is updated so that the next call resumes at the first rule not written.
+ *
+ * The error must be reported to the caller: fib_nl_dumprule() relies on it
+ * to stop walking further families while this one is only partially dumped.
+ */
+static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
+		      struct fib_rules_ops *ops)
+{
+	int idx = 0;
+	int err = 0;
+	struct fib_rule *rule;
+
+	list_for_each_entry(rule, &ops->rules_list, list) {
+		if (idx < cb->args[1])
+			goto skip;
+
+		err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
+				       cb->nlh->nlmsg_seq, RTM_NEWRULE,
+				       NLM_F_MULTI, ops);
+		if (err < 0)
+			break;
+		/* On success the fill routine returns a positive length. */
+		err = 0;
+skip:
+		idx++;
+	}
+	cb->args[1] = idx;
+	rules_ops_put(ops);
+
+	return err;
+}
+
+/*
+ * fib_nl_dumprule - netlink dump callback registered for RTM_GETRULE.
+ *
+ * Dumps the rules of one family, or of all families of the namespace when
+ * the request family is AF_UNSPEC.  cb->args[0] holds the index of the
+ * family to resume with, cb->args[1] the rule index within that family.
+ *
+ * Returns skb->len, or -EAFNOSUPPORT if the requested family has no rules
+ * ops.
+ */
+static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_rules_ops *ops;
+	int idx = 0, family;
+
+	family = rtnl_msg_family(cb->nlh);
+	if (family != AF_UNSPEC) {
+		/* Protocol specific dump request */
+		ops = lookup_rules_ops(net, family);
+		if (ops == NULL)
+			return -EAFNOSUPPORT;
+
+		/* A full skb is not an error, cb->args[1] tells where to resume. */
+		dump_rules(skb, cb, ops);
+
+		return skb->len;
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+		if (idx < cb->args[0] || !try_module_get(ops->owner))
+			goto skip;
+
+		/*
+		 * Stop at the first family that did not fit completely so
+		 * that idx and cb->args[1] still point at it on resume.
+		 */
+		if (dump_rules(skb, cb, ops) < 0)
+			break;
+
+		cb->args[1] = 0;
+	skip:
+		idx++;
+	}
+	rcu_read_unlock();
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+/*
+ * notify_rule_change - multicast a rule addition/removal to listeners.
+ *
+ * Builds a message of type @event describing @rule and hands it to
+ * rtnl_notify() for the group of @ops.  Any failure along the way is
+ * recorded on the group's sockets via rtnl_set_sk_err().
+ */
+static void notify_rule_change(int event, struct fib_rule *rule,
+			       struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+			       u32 pid)
+{
+	struct net *net = ops->fro_net;
+	struct sk_buff *msg;
+	int rc;
+
+	msg = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
+	if (!msg) {
+		rc = -ENOBUFS;
+		goto report;
+	}
+
+	rc = fib_nl_fill_rule(msg, rule, pid, nlh->nlmsg_seq, event, 0, ops);
+	if (rc >= 0) {
+		rc = rtnl_notify(msg, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+	} else {
+		/* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
+		WARN_ON(rc == -EMSGSIZE);
+		kfree_skb(msg);
+	}
+
+report:
+	if (rc < 0)
+		rtnl_set_sk_err(net, ops->nlgroup, rc);
+}
+
+/*
+ * Bind every detached rule (ifindex == -1) that names @dev to the
+ * interface index of @dev.
+ */
+static void attach_rules(struct list_head *rules, struct net_device *dev)
+{
+	struct fib_rule *r;
+
+	list_for_each_entry(r, rules, list) {
+		if (r->ifindex != -1)
+			continue;
+		if (strcmp(dev->name, r->ifname) != 0)
+			continue;
+
+		r->ifindex = dev->ifindex;
+	}
+}
+
+/*
+ * Mark every rule currently bound to the interface index of @dev as
+ * detached by resetting its ifindex to -1.
+ */
+static void detach_rules(struct list_head *rules, struct net_device *dev)
+{
+	struct fib_rule *r;
+
+	list_for_each_entry(r, rules, list) {
+		if (r->ifindex != dev->ifindex)
+			continue;
+
+		r->ifindex = -1;
+	}
+}
+
+
+/*
+ * Netdevice notifier callback: keeps the ifindex stored in device bound
+ * rules in sync with devices appearing in and vanishing from the namespace.
+ * Always returns NOTIFY_DONE.
+ */
+static int fib_rules_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+	struct fib_rules_ops *ops;
+
+	ASSERT_RTNL();
+	rcu_read_lock();
+
+	if (event == NETDEV_REGISTER) {
+		list_for_each_entry(ops, &net->rules_ops, list)
+			attach_rules(&ops->rules_list, dev);
+	} else if (event == NETDEV_UNREGISTER) {
+		list_for_each_entry(ops, &net->rules_ops, list)
+			detach_rules(&ops->rules_list, dev);
+	}
+
+	rcu_read_unlock();
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier block; registered from fib_rules_init(). */
+static struct notifier_block fib_rules_notifier = {
+ .notifier_call = fib_rules_event,
+};
+
+/*
+ * Per namespace setup: prepare the lock and the (empty) list that hold the
+ * registered rules ops of @net.  Cannot fail, always returns 0.
+ */
+static int fib_rules_net_init(struct net *net)
+{
+	spin_lock_init(&net->rules_mod_lock);
+	INIT_LIST_HEAD(&net->rules_ops);
+
+	return 0;
+}
+
+/* Per namespace operations; only an init hook is provided. */
+static struct pernet_operations fib_rules_net_ops = {
+ .init = fib_rules_net_init,
+};
+
+/*
+ * fib_rules_init - boot time setup of the generic rules code.
+ *
+ * Registers the rtnetlink handlers, the per namespace init hook and the
+ * netdevice notifier.
+ *
+ * The per namespace hook has to be registered before the notifier:
+ * registering a netdevice notifier replays NETDEV_REGISTER for devices that
+ * already exist, and fib_rules_event() walks net->rules_ops, which is only
+ * valid once fib_rules_net_init() has initialised it.
+ *
+ * Returns 0 on success or a negative errno, in which case everything that
+ * was registered has been unregistered again.
+ */
+static int __init fib_rules_init(void)
+{
+	int err;
+
+	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule);
+
+	err = register_pernet_subsys(&fib_rules_net_ops);
+	if (err < 0)
+		goto fail;
+
+	err = register_netdevice_notifier(&fib_rules_notifier);
+	if (err < 0)
+		goto fail_unregister;
+
+	return 0;
+
+fail_unregister:
+	unregister_pernet_subsys(&fib_rules_net_ops);
+fail:
+	rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
+	rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
+	rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+	return err;
+}
+
+subsys_initcall(fib_rules_init);
diff --git a/net/core/filter.c b/net/core/filter.c
new file mode 100644
index 0000000..df37443
--- /dev/null
+++ b/net/core/filter.c
@@ -0,0 +1,522 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Author:
+ * Jay Schulist <jschlst@samba.org>
+ *
+ * Based on the design of:
+ * - The Berkeley Packet Filter
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/filter.h>
+
+/*
+ * Resolve a negative (special) offset into a pointer inside the skb head.
+ * Offsets at or above SKF_NET_OFF are relative to the network header,
+ * offsets at or above SKF_LL_OFF to the link layer header.
+ *
+ * Returns NULL unless all @size bytes starting at the resulting address lie
+ * between skb->head and the tail pointer; checking only the first byte would
+ * let a multi-byte load read past the end of the data.
+ *
+ * No hurry in this branch
+ */
+static void *__load_pointer(struct sk_buff *skb, int k, unsigned int size)
+{
+	u8 *ptr = NULL;
+
+	if (k >= SKF_NET_OFF)
+		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+	else if (k >= SKF_LL_OFF)
+		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+
+	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
+		return ptr;
+	return NULL;
+}
+
+/*
+ * Return a pointer to @size bytes of packet data at offset @k, or NULL if
+ * they are not available.  Non-negative offsets go through
+ * skb_header_pointer(), which may copy into @buffer.  Negative offsets at
+ * or above SKF_AD_OFF are ancillary data requests and never yield a
+ * pointer; the remaining negative offsets are header relative.
+ */
+static inline void *load_pointer(struct sk_buff *skb, int k,
+				 unsigned int size, void *buffer)
+{
+	if (k >= 0)
+		return skb_header_pointer(skb, k, size, buffer);
+	else {
+		if (k >= SKF_AD_OFF)
+			return NULL;
+		return __load_pointer(skb, k, size);
+	}
+}
+
+/**
+ *	sk_filter - run a packet through a socket filter
+ *	@sk: sock associated with &sk_buff
+ *	@skb: buffer to filter
+ *
+ * Socket level wrapper around sk_run_filter().  The security hook is
+ * consulted first; a non-zero result from it is returned as is.  If the
+ * socket has a filter attached, the filter decides how many bytes of the
+ * packet to keep: zero means the packet is tossed (-EPERM), any other
+ * value is passed to pskb_trim(), which leaves an skb that is already
+ * shorter untouched.  Without a filter the packet is accepted.
+ *
+ * Returns 0 if the packet should be accepted, a negative errno otherwise.
+ */
+int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_filter *fp;
+	int rc;
+
+	rc = security_sock_rcv_skb(sk, skb);
+	if (rc)
+		return rc;
+
+	rcu_read_lock_bh();
+	fp = rcu_dereference(sk->sk_filter);
+	if (fp) {
+		unsigned int keep = sk_run_filter(skb, fp->insns, fp->len);
+
+		if (keep)
+			rc = pskb_trim(skb, keep);
+		else
+			rc = -EPERM;
+	}
+	rcu_read_unlock_bh();
+
+	return rc;
+}
+EXPORT_SYMBOL(sk_filter);
+
+/**
+ *	sk_run_filter - run a filter on a socket
+ *	@skb: buffer to run the filter on
+ *	@filter: filter to apply
+ *	@flen: length of filter
+ *
+ * Decode and apply filter instructions to the skb->data.
+ * Return length to keep, 0 for none. skb is the data we are
+ * filtering, filter is the array of filter instructions, and
+ * flen is the number of filter blocks in the array.
+ *
+ * The program is expected to have passed sk_chk_filter(); jump targets,
+ * scratch memory indices and constant divisors are not re-validated here.
+ */
+unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
+{
+	struct sock_filter *fentry;	/* We walk down these */
+	void *ptr;
+	u32 A = 0;			/* Accumulator */
+	u32 X = 0;			/* Index Register */
+	/*
+	 * Scratch Memory Store.  Zeroed so that a program loading a slot it
+	 * never stored to cannot observe stale kernel stack contents.
+	 */
+	u32 mem[BPF_MEMWORDS] = { 0 };
+	u32 tmp;
+	int k;
+	int pc;
+
+	/*
+	 * Process array of filter instructions.
+	 */
+	for (pc = 0; pc < flen; pc++) {
+		fentry = &filter[pc];
+
+		switch (fentry->code) {
+		case BPF_ALU|BPF_ADD|BPF_X:
+			A += X;
+			continue;
+		case BPF_ALU|BPF_ADD|BPF_K:
+			A += fentry->k;
+			continue;
+		case BPF_ALU|BPF_SUB|BPF_X:
+			A -= X;
+			continue;
+		case BPF_ALU|BPF_SUB|BPF_K:
+			A -= fentry->k;
+			continue;
+		case BPF_ALU|BPF_MUL|BPF_X:
+			A *= X;
+			continue;
+		case BPF_ALU|BPF_MUL|BPF_K:
+			A *= fentry->k;
+			continue;
+		case BPF_ALU|BPF_DIV|BPF_X:
+			/* X is only known at run time, so check it here. */
+			if (X == 0)
+				return 0;
+			A /= X;
+			continue;
+		case BPF_ALU|BPF_DIV|BPF_K:
+			/* A zero constant is rejected by sk_chk_filter(). */
+			A /= fentry->k;
+			continue;
+		case BPF_ALU|BPF_AND|BPF_X:
+			A &= X;
+			continue;
+		case BPF_ALU|BPF_AND|BPF_K:
+			A &= fentry->k;
+			continue;
+		case BPF_ALU|BPF_OR|BPF_X:
+			A |= X;
+			continue;
+		case BPF_ALU|BPF_OR|BPF_K:
+			A |= fentry->k;
+			continue;
+		case BPF_ALU|BPF_LSH|BPF_X:
+			A <<= X;
+			continue;
+		case BPF_ALU|BPF_LSH|BPF_K:
+			A <<= fentry->k;
+			continue;
+		case BPF_ALU|BPF_RSH|BPF_X:
+			A >>= X;
+			continue;
+		case BPF_ALU|BPF_RSH|BPF_K:
+			A >>= fentry->k;
+			continue;
+		case BPF_ALU|BPF_NEG:
+			A = -A;
+			continue;
+		/* Jump offsets are relative to the following instruction. */
+		case BPF_JMP|BPF_JA:
+			pc += fentry->k;
+			continue;
+		case BPF_JMP|BPF_JGT|BPF_K:
+			pc += (A > fentry->k) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JGE|BPF_K:
+			pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JEQ|BPF_K:
+			pc += (A == fentry->k) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JSET|BPF_K:
+			pc += (A & fentry->k) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JGT|BPF_X:
+			pc += (A > X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JGE|BPF_X:
+			pc += (A >= X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JEQ|BPF_X:
+			pc += (A == X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_JMP|BPF_JSET|BPF_X:
+			pc += (A & X) ? fentry->jt : fentry->jf;
+			continue;
+		/*
+		 * Packet loads: when no pointer can be obtained the switch
+		 * is left with break and k is examined as an ancillary
+		 * data request below.
+		 */
+		case BPF_LD|BPF_W|BPF_ABS:
+			k = fentry->k;
+load_w:
+			ptr = load_pointer(skb, k, 4, &tmp);
+			if (ptr != NULL) {
+				A = get_unaligned_be32(ptr);
+				continue;
+			}
+			break;
+		case BPF_LD|BPF_H|BPF_ABS:
+			k = fentry->k;
+load_h:
+			ptr = load_pointer(skb, k, 2, &tmp);
+			if (ptr != NULL) {
+				A = get_unaligned_be16(ptr);
+				continue;
+			}
+			break;
+		case BPF_LD|BPF_B|BPF_ABS:
+			k = fentry->k;
+load_b:
+			ptr = load_pointer(skb, k, 1, &tmp);
+			if (ptr != NULL) {
+				A = *(u8 *)ptr;
+				continue;
+			}
+			break;
+		case BPF_LD|BPF_W|BPF_LEN:
+			A = skb->len;
+			continue;
+		case BPF_LDX|BPF_W|BPF_LEN:
+			X = skb->len;
+			continue;
+		case BPF_LD|BPF_W|BPF_IND:
+			k = X + fentry->k;
+			goto load_w;
+		case BPF_LD|BPF_H|BPF_IND:
+			k = X + fentry->k;
+			goto load_h;
+		case BPF_LD|BPF_B|BPF_IND:
+			k = X + fentry->k;
+			goto load_b;
+		case BPF_LDX|BPF_B|BPF_MSH:
+			ptr = load_pointer(skb, fentry->k, 1, &tmp);
+			if (ptr != NULL) {
+				X = (*(u8 *)ptr & 0xf) << 2;
+				continue;
+			}
+			return 0;
+		case BPF_LD|BPF_IMM:
+			A = fentry->k;
+			continue;
+		case BPF_LDX|BPF_IMM:
+			X = fentry->k;
+			continue;
+		case BPF_LD|BPF_MEM:
+			A = mem[fentry->k];
+			continue;
+		case BPF_LDX|BPF_MEM:
+			X = mem[fentry->k];
+			continue;
+		case BPF_MISC|BPF_TAX:
+			X = A;
+			continue;
+		case BPF_MISC|BPF_TXA:
+			A = X;
+			continue;
+		case BPF_RET|BPF_K:
+			return fentry->k;
+		case BPF_RET|BPF_A:
+			return A;
+		case BPF_ST:
+			mem[fentry->k] = A;
+			continue;
+		case BPF_STX:
+			mem[fentry->k] = X;
+			continue;
+		default:
+			WARN_ON(1);
+			return 0;
+		}
+
+		/*
+		 * Handle ancillary data, which is impossible (or very
+		 * difficult) to obtain by parsing the packet contents.
+		 */
+		switch (k-SKF_AD_OFF) {
+		case SKF_AD_PROTOCOL:
+			A = ntohs(skb->protocol);
+			continue;
+		case SKF_AD_PKTTYPE:
+			A = skb->pkt_type;
+			continue;
+		case SKF_AD_IFINDEX:
+			/* Not every filtered skb has a device attached. */
+			if (!skb->dev)
+				return 0;
+			A = skb->dev->ifindex;
+			continue;
+		case SKF_AD_NLATTR: {
+			struct nlattr *nla;
+
+			if (skb_is_nonlinear(skb))
+				return 0;
+			/*
+			 * Guard the unsigned subtraction below: with an skb
+			 * shorter than an attribute header it would wrap
+			 * and let any offset in A pass the range check.
+			 */
+			if (skb->len < sizeof(struct nlattr))
+				return 0;
+			if (A > skb->len - sizeof(struct nlattr))
+				return 0;
+
+			nla = nla_find((struct nlattr *)&skb->data[A],
+				       skb->len - A, X);
+			if (nla)
+				A = (void *)nla - (void *)skb->data;
+			else
+				A = 0;
+			continue;
+		}
+		default:
+			return 0;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sk_run_filter);
+
+/**
+ * sk_chk_filter - verify socket filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Check the user's filter code. If we let some ugly
+ * filter code slip through kaboom! The filter must contain
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
+ *
+ * All jumps are forward as they are not signed.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+int sk_chk_filter(struct sock_filter *filter, int flen)
+{
+ struct sock_filter *ftest;
+ int pc;
+
+ /* Empty programs and programs above the instruction limit are rejected. */
+ if (flen == 0 || flen > BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* check the filter code now */
+ for (pc = 0; pc < flen; pc++) {
+ ftest = &filter[pc];
+
+ /* Only allow valid instructions */
+ switch (ftest->code) {
+ case BPF_ALU|BPF_ADD|BPF_K:
+ case BPF_ALU|BPF_ADD|BPF_X:
+ case BPF_ALU|BPF_SUB|BPF_K:
+ case BPF_ALU|BPF_SUB|BPF_X:
+ case BPF_ALU|BPF_MUL|BPF_K:
+ case BPF_ALU|BPF_MUL|BPF_X:
+ case BPF_ALU|BPF_DIV|BPF_X:
+ case BPF_ALU|BPF_AND|BPF_K:
+ case BPF_ALU|BPF_AND|BPF_X:
+ case BPF_ALU|BPF_OR|BPF_K:
+ case BPF_ALU|BPF_OR|BPF_X:
+ case BPF_ALU|BPF_LSH|BPF_K:
+ case BPF_ALU|BPF_LSH|BPF_X:
+ case BPF_ALU|BPF_RSH|BPF_K:
+ case BPF_ALU|BPF_RSH|BPF_X:
+ case BPF_ALU|BPF_NEG:
+ case BPF_LD|BPF_W|BPF_ABS:
+ case BPF_LD|BPF_H|BPF_ABS:
+ case BPF_LD|BPF_B|BPF_ABS:
+ case BPF_LD|BPF_W|BPF_LEN:
+ case BPF_LD|BPF_W|BPF_IND:
+ case BPF_LD|BPF_H|BPF_IND:
+ case BPF_LD|BPF_B|BPF_IND:
+ case BPF_LD|BPF_IMM:
+ case BPF_LDX|BPF_W|BPF_LEN:
+ case BPF_LDX|BPF_B|BPF_MSH:
+ case BPF_LDX|BPF_IMM:
+ case BPF_MISC|BPF_TAX:
+ case BPF_MISC|BPF_TXA:
+ case BPF_RET|BPF_K:
+ case BPF_RET|BPF_A:
+ break;
+
+ /* Some instructions need special checks */
+
+ case BPF_ALU|BPF_DIV|BPF_K:
+ /* check for division by zero */
+ if (ftest->k == 0)
+ return -EINVAL;
+ break;
+
+ case BPF_LD|BPF_MEM:
+ case BPF_LDX|BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* check for invalid memory addresses */
+ if (ftest->k >= BPF_MEMWORDS)
+ return -EINVAL;
+ break;
+
+ case BPF_JMP|BPF_JA:
+ /*
+ * Note, the large ftest->k might cause loops.
+ * Compare this with conditional jumps below,
+ * where offsets are limited. --ANK (981016)
+ */
+ if (ftest->k >= (unsigned)(flen-pc-1))
+ return -EINVAL;
+ break;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ case BPF_JMP|BPF_JGE|BPF_K:
+ case BPF_JMP|BPF_JGE|BPF_X:
+ case BPF_JMP|BPF_JGT|BPF_K:
+ case BPF_JMP|BPF_JGT|BPF_X:
+ case BPF_JMP|BPF_JSET|BPF_K:
+ case BPF_JMP|BPF_JSET|BPF_X:
+ /* for conditionals both must be safe */
+ /* i.e. both pc + 1 + jt and pc + 1 + jf have to be inside the program */
+ if (pc + ftest->jt + 1 >= flen ||
+ pc + ftest->jf + 1 >= flen)
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* The last instruction has to be of the RET class. */
+ return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL(sk_chk_filter);
+
+/**
+ * 	sk_filter_rcu_release: Release a socket filter by rcu_head
+ *	@rcu: rcu_head that contains the sk_filter to free
+ *
+ * RCU callback: maps the rcu_head back to its enclosing sk_filter and
+ * hands that filter to sk_filter_release().
+ */
+static void sk_filter_rcu_release(struct rcu_head *rcu)
+{
+	sk_filter_release(container_of(rcu, struct sk_filter, rcu));
+}
+
+/*
+ * Subtract the size of @fp from the option memory accounted to @sk right
+ * away, and queue the release of the filter itself as an RCU-bh callback.
+ */
+static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+	atomic_sub(sk_filter_len(fp), &sk->sk_omem_alloc);
+	call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
+}
+
+/**
+ * sk_attach_filter - attach a socket filter
+ * @fprog: the filter program
+ * @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ *
+ * Errors returned here: -EINVAL for a NULL program pointer, -ENOMEM if
+ * the allocation fails, -EFAULT if the program cannot be copied from user
+ * space, or the result of sk_chk_filter() for an invalid program.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+ unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+ int err;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ /* One allocation holds the sk_filter header followed by the instructions. */
+ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+ sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+ return -EFAULT;
+ }
+
+ atomic_set(&fp->refcnt, 1);
+ fp->len = fprog->len;
+
+ /* Validate the kernel copy, not the user buffer. */
+ err = sk_chk_filter(fp->insns, fp->len);
+ if (err) {
+ sk_filter_uncharge(sk, fp);
+ return err;
+ }
+
+ /* Publish the new filter and remember the one it replaces, if any. */
+ rcu_read_lock_bh();
+ old_fp = rcu_dereference(sk->sk_filter);
+ rcu_assign_pointer(sk->sk_filter, fp);
+ rcu_read_unlock_bh();
+
+ if (old_fp)
+ sk_filter_delayed_uncharge(sk, old_fp);
+ return 0;
+}
+
+/*
+ * sk_detach_filter - remove the filter attached to a socket
+ * @sk: the socket to use
+ *
+ * Clears sk->sk_filter and schedules the old filter for release.
+ * Returns 0 on success or -ENOENT if the socket had no filter attached.
+ */
+int sk_detach_filter(struct sock *sk)
+{
+	struct sk_filter *old;
+	int ret;
+
+	rcu_read_lock_bh();
+	old = rcu_dereference(sk->sk_filter);
+	if (!old) {
+		ret = -ENOENT;
+	} else {
+		rcu_assign_pointer(sk->sk_filter, NULL);
+		sk_filter_delayed_uncharge(sk, old);
+		ret = 0;
+	}
+	rcu_read_unlock_bh();
+
+	return ret;
+}
diff --git a/net/core/flow.c b/net/core/flow.c
new file mode 100644
index 0000000..5cf8105
--- /dev/null
+++ b/net/core/flow.c
@@ -0,0 +1,368 @@
+/* flow.c: Generic flow cache.
+ *
+ * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/completion.h>
+#include <linux/percpu.h>
+#include <linux/bitops.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/mutex.h>
+#include <net/flow.h>
+#include <asm/atomic.h>
+#include <linux/security.h>
+
+/* One cached flow resolution; entries are chained per hash bucket. */
+struct flow_cache_entry {
+ /* next entry in the same hash bucket */
+ struct flow_cache_entry *next;
+ /* family, dir and key together identify the flow in flow_cache_lookup() */
+ u16 family;
+ u8 dir;
+ /* value of flow_cache_genid at the time object was resolved */
+ u32 genid;
+ struct flowi key;
+ /* object returned by the resolver, may be NULL */
+ void *object;
+ /* counter incremented/decremented while object is referenced */
+ atomic_t *object_ref;
+};
+
+/* Generation counter; cached objects resolved under another value are stale. */
+atomic_t flow_cache_genid = ATOMIC_INIT(0);
+
+/* The hash table of each cpu has 1 << flow_hash_shift buckets. */
+static u32 flow_hash_shift;
+#define flow_hash_size (1 << flow_hash_shift)
+static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
+
+#define flow_table(cpu) (per_cpu(flow_tables, cpu))
+
+/* Slab cache backing struct flow_cache_entry. */
+static struct kmem_cache *flow_cachep __read_mostly;
+
+/*
+ * Low/high watermarks: exceeding flow_hwm entries on a cpu triggers a shrink,
+ * whose per bucket target length is derived from flow_lwm.
+ */
+static int flow_lwm, flow_hwm;
+
+/* Per cpu hashing state and entry count. */
+struct flow_percpu_info {
+ /* non-zero requests a new hash_rnd on the next lookup on this cpu */
+ int hash_rnd_recalc;
+ /* random seed mixed into the hash */
+ u32 hash_rnd;
+ /* number of entries in this cpu's table */
+ int count;
+};
+static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
+
+#define flow_hash_rnd_recalc(cpu) \
+ (per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
+#define flow_hash_rnd(cpu) \
+ (per_cpu(flow_hash_info, cpu).hash_rnd)
+#define flow_count(cpu) \
+ (per_cpu(flow_hash_info, cpu).count)
+
+/* Timer that periodically requests new hash seeds on all cpus. */
+static struct timer_list flow_hash_rnd_timer;
+
+#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
+
+/* Shared state of one flow_cache_flush() run. */
+struct flow_flush_info {
+ /* number of cpus that still have to finish their flush */
+ atomic_t cpuleft;
+ /* completed by the last cpu to finish */
+ struct completion completion;
+};
+static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
+
+#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+
+/*
+ * Timer callback: flag every possible cpu as needing a new hash seed and
+ * re-arm the timer for the next period.  @arg is unused.
+ */
+static void flow_cache_new_hashrnd(unsigned long arg)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		flow_hash_rnd_recalc(cpu) = 1;
+	}
+
+	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&flow_hash_rnd_timer);
+}
+
+/*
+ * Dispose of an entry that has already been unlinked from its bucket:
+ * drop the reference on the cached object, if there is one, free the
+ * entry and adjust the entry count of @cpu.
+ */
+static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
+{
+	flow_count(cpu)--;
+
+	if (fle->object != NULL)
+		atomic_dec(fle->object_ref);
+
+	kmem_cache_free(flow_cachep, fle);
+}
+
+/*
+ * Trim every hash bucket of @cpu to at most @shrink_to entries.  The
+ * entries at the head of a chain are kept, everything behind them is
+ * unlinked and killed.
+ */
+static void __flow_cache_shrink(int cpu, int shrink_to)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < flow_hash_size; bucket++) {
+		struct flow_cache_entry **link = &flow_table(cpu)[bucket];
+		struct flow_cache_entry *victim;
+		int kept;
+
+		/* Step over the entries that are allowed to stay. */
+		for (kept = 0; *link != NULL && kept < shrink_to; kept++)
+			link = &(*link)->next;
+
+		/* Unlink and free the remainder of the chain. */
+		while ((victim = *link) != NULL) {
+			*link = victim->next;
+			flow_entry_kill(cpu, victim);
+		}
+	}
+}
+
+/*
+ * Shrink the table of @cpu so that each bucket holds no more than its
+ * share of the low watermark.
+ */
+static void flow_cache_shrink(int cpu)
+{
+	__flow_cache_shrink(cpu, flow_lwm / flow_hash_size);
+}
+
+/*
+ * Pick a fresh random hash seed for @cpu, clear the pending request for
+ * it, and empty the cpu's table: entries hashed with the previous seed
+ * would not be found in their buckets any more.
+ */
+static void flow_new_hash_rnd(int cpu)
+{
+	u32 *seed = &flow_hash_rnd(cpu);
+
+	get_random_bytes(seed, sizeof(u32));
+	flow_hash_rnd_recalc(cpu) = 0;
+
+	__flow_cache_shrink(cpu, 0);
+}
+
+/*
+ * Hash a flow key, viewed as an array of 32 bit words, with the seed of
+ * @cpu and reduce the result to a bucket index.
+ */
+static u32 flow_hash_code(struct flowi *key, int cpu)
+{
+	u32 *words = (u32 *) key;
+	u32 hash;
+
+	hash = jhash2(words, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu));
+
+	return (hash & (flow_hash_size - 1));
+}
+
+#if (BITS_PER_LONG == 64)
+typedef u64 flow_compare_t;
+#else
+typedef u32 flow_compare_t;
+#endif
+
+/* I hear what you're saying, use memcmp.  But memcmp cannot make
+ * important assumptions that we can here, such as alignment and
+ * constant size.
+ *
+ * Compares two flow keys one flow_compare_t sized word at a time.
+ * Returns 0 if the keys are identical and 1 at the first difference.
+ */
+static int flow_key_compare(struct flowi *key1, struct flowi *key2)
+{
+	const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
+	flow_compare_t *lhs = (flow_compare_t *) key1;
+	flow_compare_t *rhs = (flow_compare_t *) key2;
+	int i = 0;
+
+	/* The word wise walk requires the key size to be a multiple. */
+	BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t));
+
+	do {
+		if (lhs[i] != rhs[i])
+			return 1;
+	} while (++i < n_elem);
+
+	return 0;
+}
+
+/*
+ * flow_cache_lookup - look a flow up in this cpu's cache, resolving on a miss.
+ *
+ * Runs with bottom halves disabled.  If an entry matching key, family and dir
+ * exists and was resolved under the current flow_cache_genid, its object is
+ * returned with object_ref incremented (when the object is non-NULL).
+ * Otherwise the resolver is called; on success its result is stored in the
+ * (new or stale) entry, if there is one.
+ *
+ * Returns the object (possibly NULL) or ERR_PTR(err) if the resolver failed.
+ */
+void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
+ flow_resolve_t resolver)
+{
+ struct flow_cache_entry *fle, **head;
+ unsigned int hash;
+ int cpu;
+
+ local_bh_disable();
+ cpu = smp_processor_id();
+
+ fle = NULL;
+ /* Packet really early in init? Making flow_cache_init a
+ * pre-smp initcall would solve this. --RR */
+ if (!flow_table(cpu))
+ goto nocache;
+
+ if (flow_hash_rnd_recalc(cpu))
+ flow_new_hash_rnd(cpu);
+ hash = flow_hash_code(key, cpu);
+
+ head = &flow_table(cpu)[hash];
+ for (fle = *head; fle; fle = fle->next) {
+ if (fle->family == family &&
+ fle->dir == dir &&
+ flow_key_compare(key, &fle->key) == 0) {
+ if (fle->genid == atomic_read(&flow_cache_genid)) {
+ void *ret = fle->object;
+
+ if (ret)
+ atomic_inc(fle->object_ref);
+ local_bh_enable();
+
+ return ret;
+ }
+ /* Stale entry: keep fle so that it is refreshed below. */
+ break;
+ }
+ }
+
+ if (!fle) {
+ if (flow_count(cpu) > flow_hwm)
+ flow_cache_shrink(cpu);
+
+ fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
+ if (fle) {
+ fle->next = *head;
+ *head = fle;
+ fle->family = family;
+ fle->dir = dir;
+ memcpy(&fle->key, key, sizeof(*key));
+ fle->object = NULL;
+ /*
+ * NOTE(review): genid is not initialised here and stays that
+ * way if the resolver below fails; a later lookup compares it
+ * against flow_cache_genid. Confirm this cannot produce a
+ * bogus hit with a NULL object.
+ */
+ flow_count(cpu)++;
+ }
+ }
+
+nocache:
+ {
+ int err;
+ void *obj;
+ atomic_t *obj_ref;
+
+ err = resolver(key, family, dir, &obj, &obj_ref);
+
+ if (fle && !err) {
+ fle->genid = atomic_read(&flow_cache_genid);
+
+ /* Drop the reference held on the object being replaced. */
+ if (fle->object)
+ atomic_dec(fle->object_ref);
+
+ fle->object = obj;
+ fle->object_ref = obj_ref;
+ if (obj)
+ atomic_inc(fle->object_ref);
+ }
+ local_bh_enable();
+
+ if (err)
+ obj = ERR_PTR(err);
+ return obj;
+ }
+}
+
+/*
+ * Flush the table of the current cpu: every entry whose object was
+ * resolved under a generation other than the current one loses that
+ * object and the reference held on it.  The last cpu to finish completes
+ * the flow_flush_info passed in @data.
+ */
+static void flow_cache_flush_tasklet(unsigned long data)
+{
+	struct flow_flush_info *info = (void *)data;
+	int cpu = smp_processor_id();
+	int bucket;
+
+	for (bucket = 0; bucket < flow_hash_size; bucket++) {
+		struct flow_cache_entry *fle;
+
+		for (fle = flow_table(cpu)[bucket]; fle; fle = fle->next) {
+			unsigned genid = atomic_read(&flow_cache_genid);
+
+			if (fle->object && fle->genid != genid) {
+				fle->object = NULL;
+				atomic_dec(fle->object_ref);
+			}
+		}
+	}
+
+	if (atomic_dec_and_test(&info->cpuleft))
+		complete(&info->completion);
+}
+
+static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
+/*
+ * Cross call target: point the flush tasklet of the executing cpu at the
+ * flush state in @data and schedule it.
+ */
+static void flow_cache_flush_per_cpu(void *data)
+{
+	struct tasklet_struct *t = flow_flush_tasklet(smp_processor_id());
+
+	t->data = (unsigned long)data;
+	tasklet_schedule(t);
+}
+
+/*
+ * flow_cache_flush - drop stale cached objects on all online cpus.
+ *
+ * Schedules the flush tasklet on every other online cpu, runs the flush
+ * directly on the local cpu and then waits until all of them have finished.
+ * Concurrent callers are serialised by a mutex.
+ */
+void flow_cache_flush(void)
+{
+ /* Lives on the stack; valid because we wait for the completion below. */
+ struct flow_flush_info info;
+ static DEFINE_MUTEX(flow_flush_sem);
+
+ /* Don't want cpus going down or up during this. */
+ get_online_cpus();
+ mutex_lock(&flow_flush_sem);
+ atomic_set(&info.cpuleft, num_online_cpus());
+ init_completion(&info.completion);
+
+ local_bh_disable();
+ smp_call_function(flow_cache_flush_per_cpu, &info, 0);
+ /* The local cpu is flushed by calling the tasklet body directly. */
+ flow_cache_flush_tasklet((unsigned long)&info);
+ local_bh_enable();
+
+ wait_for_completion(&info.completion);
+ mutex_unlock(&flow_flush_sem);
+ put_online_cpus();
+}
+
+/*
+ * Set up the flow cache of one cpu: allocate a zeroed bucket array large
+ * enough for flow_hash_size pointers, reset the per cpu bookkeeping and
+ * initialise the flush tasklet.  Panics if the table cannot be allocated.
+ */
+static void __devinit flow_cache_cpu_prepare(int cpu)
+{
+	struct tasklet_struct *tasklet;
+	unsigned long order = 0;
+
+	/* Smallest page order whose size covers the bucket array. */
+	while ((PAGE_SIZE << order) <
+	       (sizeof(struct flow_cache_entry *)*flow_hash_size))
+		order++;
+
+	flow_table(cpu) = (struct flow_cache_entry **)
+		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	if (!flow_table(cpu))
+		panic("NET: failed to allocate flow cache order %lu\n", order);
+
+	flow_count(cpu) = 0;
+	/* Have the first lookup on this cpu pick a random hash seed. */
+	flow_hash_rnd_recalc(cpu) = 1;
+
+	tasklet = flow_flush_tasklet(cpu);
+	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
+}
+
+/*
+ * CPU hotplug callback: once a cpu is dead, free all entries of its
+ * table.  Other actions are ignored.  Always returns NOTIFY_OK.
+ */
+static int flow_cache_cpu(struct notifier_block *nfb,
+			  unsigned long action,
+			  void *hcpu)
+{
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		__flow_cache_shrink((unsigned long)hcpu, 0);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * flow_cache_init - set up the flow cache at init time.
+ *
+ * Creates the entry slab cache, fixes the table size and watermarks, starts
+ * the hash seed timer, prepares the table of every possible cpu and hooks
+ * into cpu hotplug. Always returns 0.
+ */
+static int __init flow_cache_init(void)
+{
+ int i;
+
+ /* SLAB_PANIC: no NULL check needed, failure panics. */
+ flow_cachep = kmem_cache_create("flow_cache",
+ sizeof(struct flow_cache_entry),
+ 0, SLAB_PANIC,
+ NULL);
+ /* Must be set before flow_hash_size is used below. */
+ flow_hash_shift = 10;
+ flow_lwm = 2 * flow_hash_size;
+ flow_hwm = 4 * flow_hash_size;
+
+ setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0);
+ flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+ add_timer(&flow_hash_rnd_timer);
+
+ for_each_possible_cpu(i)
+ flow_cache_cpu_prepare(i);
+
+ hotcpu_notifier(flow_cache_cpu, 0);
+ return 0;
+}
+
+module_init(flow_cache_init);
+
+EXPORT_SYMBOL(flow_cache_genid);
+EXPORT_SYMBOL(flow_cache_lookup);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
new file mode 100644
index 0000000..57abe82
--- /dev/null
+++ b/net/core/gen_estimator.c
@@ -0,0 +1,266 @@
+/*
+ * net/core/gen_estimator.c Simple rate estimator.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Jamal Hadi Salim - moved it to net/core and reshuffled
+ * names to make it usable in general net subsystem.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <net/gen_stats.h>
+
+/*
+ This code is NOT intended to be used for statistics collection,
+ its purpose is to provide a base for statistical multiplexing
+ for controlled load service.
+ If you need only statistics, run a user level daemon which
+ periodically reads byte counters.
+
+ Unfortunately, rate estimation is not a very easy task.
+ F.e. I did not find a simple way to estimate the current peak rate
+ and even failed to formulate the problem 8)8)
+
+ So I preferred not to build an estimator into the scheduler,
+ but run this task separately.
+ Ideally, it should be kernel thread(s), but for now it runs
+ from timers, which puts apparent top bounds on the number of rated
+ flows, has minimal overhead on small, but is enough
+ to handle controlled load service, sets of aggregates.
+
+ We measure rate over A=(1<<interval) seconds and evaluate EWMA:
+
+ avrate = avrate*(1-W) + rate*W
+
+ where W is chosen as negative power of 2: W = 2^(-ewma_log)
+
+ The resulting time constant is:
+
+ T = A/(-ln(1-W))
+
+
+ NOTES.
+
+ * The stored value for avbps is scaled by 2^5, so that maximal
+ rate is ~1Gbit, avpps is scaled by 2^10.
+
+ * Minimal interval is HZ/4=250msec (it is the greatest common divisor
+ for HZ=100 and HZ=1024 8)), maximal interval
+ is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
+ are too expensive, longer ones can be implemented
+ at user level painlessly.
+ */
+
+#define EST_MAX_INTERVAL 5
+
+struct gen_estimator
+{
+ struct list_head list; /* link in elist[idx].list */
+ struct gnet_stats_basic *bstats; /* counters sampled; NULL once killed */
+ struct gnet_stats_rate_est *rate_est; /* where bps/pps are published */
+ spinlock_t *stats_lock; /* held by est_timer() while sampling */
+ int ewma_log; /* right-shift applied to each rate delta */
+ u64 last_bytes; /* byte counter at the previous sample */
+ u32 last_packets; /* packet counter at the previous sample */
+ u32 avpps; /* averaged packet rate, scaled by 2^10 */
+ u32 avbps; /* averaged byte rate, scaled by 2^5 */
+ struct rcu_head e_rcu; /* for the deferred kfree() via call_rcu() */
+};
+
+struct gen_estimator_head
+{
+ struct timer_list timer; /* runs est_timer() for this interval index */
+ struct list_head list; /* estimators sharing this interval */
+};
+
+static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
+
+/* Guards e->bstats: est_timer() reads it, gen_kill_estimator() clears it */
+static DEFINE_RWLOCK(est_lock);
+
+static void est_timer(unsigned long arg)
+{
+ int idx = (int)arg;
+ struct gen_estimator *e;
+ /* @arg is the elist[] index this timer was set up with. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &elist[idx].list, list) {
+ u64 nbytes;
+ u32 npackets;
+ u32 rate;
+ /* Lock order: the entry's stats_lock, then est_lock as reader. */
+ spin_lock(e->stats_lock);
+ read_lock(&est_lock);
+ if (e->bstats == NULL)
+ goto skip;
+ /* Byte rate per second scaled by 2^5; the period is 2^(idx-2) s. */
+ nbytes = e->bstats->bytes;
+ npackets = e->bstats->packets;
+ rate = (nbytes - e->last_bytes)<<(7 - idx);
+ e->last_bytes = nbytes;
+ e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
+ e->rate_est->bps = (e->avbps+0xF)>>5;
+ /* Packet rate per second, same scheme but scaled by 2^10. */
+ rate = (npackets - e->last_packets)<<(12 - idx);
+ e->last_packets = npackets;
+ e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
+ e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+ read_unlock(&est_lock);
+ spin_unlock(e->stats_lock);
+ }
+ /* Re-arm only while this interval still has estimators queued. */
+ if (!list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+ rcu_read_unlock();
+}
+
+/**
+ * gen_new_estimator - create a new rate estimator
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ * @stats_lock: statistics lock
+ * @opt: rate estimator configuration TLV
+ *
+ * Creates a new rate estimator with &bstats as source and &rate_est
+ * as destination. A new timer with the interval specified in the
+ * configuration TLV is created. Upon each interval, the latest statistics
+ * will be read from &bstats and the estimated rate will be stored in
+ * &rate_est with the statistics lock grabbed during this period.
+ *
+ * Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
+ */
+int gen_new_estimator(struct gnet_stats_basic *bstats,
+ struct gnet_stats_rate_est *rate_est,
+ spinlock_t *stats_lock,
+ struct nlattr *opt)
+{
+ struct gen_estimator *est;
+ struct gnet_estimator *parm = nla_data(opt);
+ int idx;
+
+ if (nla_len(opt) < sizeof(*parm))
+ return -EINVAL;
+ /* interval -2..3 maps to elist[0..5], i.e. periods of 250ms..8s. */
+ if (parm->interval < -2 || parm->interval > 3)
+ return -EINVAL;
+
+ est = kzalloc(sizeof(*est), GFP_KERNEL);
+ if (est == NULL)
+ return -ENOBUFS;
+
+ idx = parm->interval + 2;
+ est->bstats = bstats;
+ est->rate_est = rate_est;
+ est->stats_lock = stats_lock;
+ /* est_timer() shifts a long by ewma_log; keep it below the width. */
+ est->ewma_log = min_t(int, parm->ewma_log, BITS_PER_LONG - 1);
+ est->last_bytes = bstats->bytes;
+ est->avbps = rate_est->bps<<5;
+ est->last_packets = bstats->packets;
+ est->avpps = rate_est->pps<<10;
+ /* First use of this interval: set up its list head and timer. */
+ if (!elist[idx].timer.function) {
+ INIT_LIST_HEAD(&elist[idx].list);
+ setup_timer(&elist[idx].timer, est_timer, idx);
+ }
+ /* The timer does not re-arm on an empty list, so start it here. */
+ if (list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+ list_add_rcu(&est->list, &elist[idx].list);
+ return 0;
+}
+
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+ /* RCU callback queued by gen_kill_estimator(): release the
+ * estimator that embeds @head as its e_rcu member. */
+ kfree(container_of(head, struct gen_estimator, e_rcu));
+}
+
+/**
+ * gen_kill_estimator - remove a rate estimator
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ *
+ * Removes every rate estimator registered for &bstats and &rate_est.
+ * No timer is deleted here: est_timer() stops re-arming on an empty list.
+ *
+ * NOTE: Called under rtnl_mutex
+ */
+void gen_kill_estimator(struct gnet_stats_basic *bstats,
+ struct gnet_stats_rate_est *rate_est)
+{
+ int idx;
+ struct gen_estimator *e, *n;
+
+ for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
+
+ /* Skip non initialized indexes */
+ if (!elist[idx].timer.function)
+ continue;
+ /* _safe iteration: matching entries are unlinked inside the loop. */
+ list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+ if (e->rate_est != rate_est || e->bstats != bstats)
+ continue;
+ /* Clear bstats under est_lock so est_timer() skips this entry. */
+ write_lock_bh(&est_lock);
+ e->bstats = NULL;
+ write_unlock_bh(&est_lock);
+ /* Unlink now; the kfree() is deferred to an RCU callback. */
+ list_del_rcu(&e->list);
+ call_rcu(&e->e_rcu, __gen_kill_estimator);
+ }
+ }
+}
+
+/**
+ * gen_replace_estimator - replace rate estimator configuration
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ * @stats_lock: statistics lock
+ * @opt: rate estimator configuration TLV
+ *
+ * Replaces the configuration of a rate estimator by calling
+ * gen_kill_estimator() and gen_new_estimator().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int gen_replace_estimator(struct gnet_stats_basic *bstats,
+ struct gnet_stats_rate_est *rate_est,
+ spinlock_t *stats_lock, struct nlattr *opt)
+{
+ gen_kill_estimator(bstats, rate_est); /* gone even if the call below fails */
+ return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+}
+
+
+EXPORT_SYMBOL(gen_kill_estimator);
+EXPORT_SYMBOL(gen_new_estimator);
+EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
new file mode 100644
index 0000000..c3d0ffe
--- /dev/null
+++ b/net/core/gen_stats.c
@@ -0,0 +1,241 @@
+/*
+ * net/core/gen_stats.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Jamal Hadi Salim
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * See Documentation/networking/gen_stats.txt
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/gen_stats.h>
+#include <net/netlink.h>
+#include <net/gen_stats.h>
+
+
+static inline int
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
+{
+ NLA_PUT(d->skb, type, size, buf);
+ return 0;
+ /* NLA_PUT() branches to the label below when the attribute does not fit. */
+nla_put_failure:
+ spin_unlock_bh(d->lock);
+ return -1;
+}
+
+/**
+ * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
+ * @xstats_type: TLV type for backward compatibility xstats TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use as a container for all
+ * other statistic TLVs.
+ *
+ * The dumping handle is marked to be in backward compatibility mode telling
+ * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
+ *
+ * Returns 0 on success, or -1 with the lock released if the socket buffer is full.
+ */
+int
+gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
+ int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+ __acquires(lock)
+{
+ memset(d, 0, sizeof(*d));
+ /* Held until gnet_stats_finish_copy() or a failed gnet_stats_copy(). */
+ spin_lock_bh(lock);
+ d->lock = lock;
+ if (type)
+ d->tail = (struct nlattr *)skb_tail_pointer(skb);
+ d->skb = skb;
+ d->compat_tc_stats = tc_stats_type;
+ d->compat_xstats = xstats_type;
+ /* With @type == 0 no container TLV is opened and d->tail stays NULL. */
+ if (d->tail)
+ return gnet_stats_copy(d, type, NULL, 0);
+
+ return 0;
+}
+
+/**
+ * gnet_stats_start_copy - start dumping procedure
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use as a container for all
+ * other statistic TLVs.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
+ struct gnet_dump *d)
+{
+ return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
+}
+
+/**
+ * gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @d: dumping handle
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b)
+{
+ /* Compat mode: mirror the counters into the tc_stats copy. */
+ if (d->compat_tc_stats) {
+ d->tc_stats.packets = b->packets;
+ d->tc_stats.bytes = b->bytes;
+ }
+ /* Without an open top-level TLV there is nothing to append to. */
+ if (!d->tail)
+ return 0;
+ return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b));
+}
+
+/**
+ * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
+ * @d: dumping handle
+ * @r: rate estimator statistics
+ *
+ * Appends the rate estimator statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r)
+{
+ /* Compat mode: mirror the estimated rates into the tc_stats copy. */
+ if (d->compat_tc_stats) {
+ d->tc_stats.pps = r->pps;
+ d->tc_stats.bps = r->bps;
+ }
+ /* Without an open top-level TLV there is nothing to append to. */
+ if (!d->tail)
+ return 0;
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
+}
+
+/**
+ * gnet_stats_copy_queue - copy queue statistics into statistics TLV
+ * @d: dumping handle
+ * @q: queue statistics
+ *
+ * Appends the queue statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
+{
+ /* Compat mode: mirror the queue counters into the tc_stats copy. */
+ if (d->compat_tc_stats) {
+ d->tc_stats.overlimits = q->overlimits;
+ d->tc_stats.backlog = q->backlog;
+ d->tc_stats.qlen = q->qlen;
+ d->tc_stats.drops = q->drops;
+ }
+ /* Without an open top-level TLV there is nothing to append to. */
+ if (!d->tail)
+ return 0;
+ return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q));
+}
+
+/**
+ * gnet_stats_copy_app - copy application specific statistics into statistics TLV
+ * @d: dumping handle
+ * @st: application specific statistics data
+ * @len: length of data
+ *
+ * Appends the application specific statistics to the top level TLV created by
+ * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
+ * handle is in backward compatibility mode.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
+{
+ if (d->compat_xstats) {
+ d->xstats = st;
+ d->xstats_len = len;
+ }
+ /* Only the @st pointer was saved; it is read in gnet_stats_finish_copy(). */
+ if (d->tail)
+ return gnet_stats_copy(d, TCA_STATS_APP, st, len);
+
+ return 0;
+}
+
+/**
+ * gnet_stats_finish_copy - finish dumping procedure
+ * @d: dumping handle
+ *
+ * Corrects the length of the top level TLV to include all TLVs added
+ * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
+ * if gnet_stats_start_copy_compat() was used and releases the statistics
+ * lock.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_finish_copy(struct gnet_dump *d)
+{
+ if (d->tail)
+ d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
+ /* The compat TLVs below are appended after the top-level TLV is closed. */
+ if (d->compat_tc_stats)
+ if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
+ sizeof(d->tc_stats)) < 0)
+ return -1;
+ /* d->xstats is the caller's buffer saved by gnet_stats_copy_app(). */
+ if (d->compat_xstats && d->xstats) {
+ if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
+ d->xstats_len) < 0)
+ return -1;
+ }
+ /* On the -1 paths gnet_stats_copy() has already dropped the lock. */
+ spin_unlock_bh(d->lock);
+ return 0;
+}
+
+
+EXPORT_SYMBOL(gnet_stats_start_copy);
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
+EXPORT_SYMBOL(gnet_stats_copy_basic);
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+EXPORT_SYMBOL(gnet_stats_copy_queue);
+EXPORT_SYMBOL(gnet_stats_copy_app);
+EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/iovec.c b/net/core/iovec.c
new file mode 100644
index 0000000..4c9c012
--- /dev/null
+++ b/net/core/iovec.c
@@ -0,0 +1,238 @@
+/*
+ * iovec manipulation routines.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Andrew Lunn : Errors in iovec copying.
+ * Pedro Roque : Added memcpy_fromiovecend and
+ * csum_..._fromiovecend.
+ * Andi Kleen : fixed error handling for 2.1
+ * Alexey Kuznetsov: 2.1 optimisations
+ * Andi Kleen : Fix csum*fromiovecend for IPv6.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <net/checksum.h>
+#include <net/sock.h>
+
+/*
+ * Verify iovec. The caller must ensure that the iovec is big enough
+ * to hold the message iovec.
+ *
+ * Save time not doing access_ok. copy_*_user will make this work
+ * in any case.
+ */
+
+int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
+{
+ int size, err, ct;
+
+ if (m->msg_namelen) {
+ if (mode == VERIFY_READ) {
+ err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
+ address);
+ if (err < 0)
+ return err;
+ }
+ m->msg_name = address;
+ } else {
+ m->msg_name = NULL;
+ }
+ /* Pull the user's iovec array into the caller-supplied kernel buffer. */
+ size = m->msg_iovlen * sizeof(struct iovec);
+ if (copy_from_user(iov, m->msg_iov, size))
+ return -EFAULT;
+
+ m->msg_iov = iov;
+ err = 0;
+
+ for (ct = 0; ct < m->msg_iovlen; ct++) {
+ /*
+ * The total is returned as an int, where negative means errno.
+ * Check before adding so that a huge iov_len cannot wrap the
+ * sum back into the valid range and go unnoticed.
+ */
+ if (iov[ct].iov_len > INT_MAX - err)
+ return -EMSGSIZE;
+ err += iov[ct].iov_len;
+ }
+
+ return err;
+}
+
+/*
+ * Copy kernel to iovec. Returns -EFAULT on error.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+ /* One segment per pass; zero-length segments are stepped over. */
+ for (; len > 0; iov++) {
+ int chunk;
+ if (!iov->iov_len)
+ continue;
+ chunk = min_t(unsigned int, iov->iov_len, len);
+ if (copy_to_user(iov->iov_base, kdata, chunk))
+ return -EFAULT;
+ iov->iov_base += chunk;
+ iov->iov_len -= chunk;
+ kdata += chunk;
+ len -= chunk;
+ }
+ return 0;
+}
+
+/*
+ * Copy iovec to kernel. Returns -EFAULT on error.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+ /* One segment per pass; zero-length segments are stepped over. */
+ for (; len > 0; iov++) {
+ int chunk;
+ if (!iov->iov_len)
+ continue;
+ chunk = min_t(unsigned int, len, iov->iov_len);
+ if (copy_from_user(kdata, iov->iov_base, chunk))
+ return -EFAULT;
+ iov->iov_len -= chunk;
+ iov->iov_base += chunk;
+ kdata += chunk;
+ len -= chunk;
+ }
+ return 0;
+}
+
+/*
+ * For use with ip_build_xmit. Copies from @offset; the iovec is not modified.
+ */
+int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
+ int len)
+{
+ /* Skip over the finished iovecs */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ }
+ /* @offset is now relative to the start of the segment at *iov. */
+ while (len > 0) {
+ u8 __user *base = iov->iov_base + offset;
+ int copy = min_t(unsigned int, len, iov->iov_len - offset);
+ /* Only the first segment is entered part-way through. */
+ offset = 0;
+ if (copy_from_user(kdata, base, copy))
+ return -EFAULT;
+ len -= copy;
+ kdata += copy;
+ iov++;
+ }
+
+ return 0;
+}
+
+/*
+ * And now for the all-in-one: copy and checksum from a user iovec
+ * directly to a datagram
+ * Calls to csum_partial but the last must be in 32 bit chunks
+ *
+ * ip_build_xmit must ensure that when fragmenting only the last
+ * call to this function will be unaligned also.
+ */
+int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
+ int offset, unsigned int len, __wsum *csump)
+{
+ __wsum csum = *csump;
+ int partial_cnt = 0, err = 0;
+ /* partial_cnt: bytes already copied to kdata but not yet checksummed. */
+ /* Skip over the finished iovecs */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ }
+
+ while (len > 0) {
+ u8 __user *base = iov->iov_base + offset;
+ int copy = min_t(unsigned int, len, iov->iov_len - offset);
+ /* Only the first segment is entered part-way through. */
+ offset = 0;
+
+ /* There is a remnant from previous iov. */
+ if (partial_cnt) {
+ int par_len = 4 - partial_cnt;
+
+ /* iov component is too short ... */
+ if (par_len > copy) {
+ if (copy_from_user(kdata, base, copy))
+ goto out_fault;
+ kdata += copy;
+ base += copy;
+ partial_cnt += copy;
+ len -= copy;
+ iov++;
+ if (len)
+ continue;
+ *csump = csum_partial(kdata - partial_cnt,
+ partial_cnt, csum);
+ goto out;
+ }
+ if (copy_from_user(kdata, base, par_len))
+ goto out_fault;
+ csum = csum_partial(kdata - partial_cnt, 4, csum);
+ kdata += par_len;
+ base += par_len;
+ copy -= par_len;
+ len -= par_len;
+ partial_cnt = 0;
+ }
+ /* More data follows: hold back the tail that is not a multiple of 4. */
+ if (len > copy) {
+ partial_cnt = copy % 4;
+ if (partial_cnt) {
+ copy -= partial_cnt;
+ if (copy_from_user(kdata + copy, base + copy,
+ partial_cnt))
+ goto out_fault;
+ }
+ }
+ /* Copy and checksum the rest of this segment in a single pass. */
+ if (copy) {
+ csum = csum_and_copy_from_user(base, kdata, copy,
+ csum, &err);
+ if (err)
+ goto out;
+ }
+ len -= copy + partial_cnt;
+ kdata += copy + partial_cnt;
+ iov++;
+ }
+ *csump = csum;
+out:
+ return err;
+
+out_fault:
+ err = -EFAULT;
+ goto out;
+}
+
+EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
+EXPORT_SYMBOL(memcpy_fromiovec);
+EXPORT_SYMBOL(memcpy_fromiovecend);
+EXPORT_SYMBOL(memcpy_toiovec);
diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h
new file mode 100644
index 0000000..283c2b9
--- /dev/null
+++ b/net/core/kmap_skb.h
@@ -0,0 +1,19 @@
+#include <linux/highmem.h>
+
+static inline void *kmap_skb_frag(const skb_frag_t *frag)
+{
+#ifdef CONFIG_HIGHMEM
+ BUG_ON(in_irq());
+ /* Left disabled on return; kunmap_skb_frag() re-enables softirqs. */
+ local_bh_disable();
+#endif
+ return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
+}
+
+static inline void kunmap_skb_frag(void *vaddr)
+{
+ kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ);
+#ifdef CONFIG_HIGHMEM
+ local_bh_enable(); /* pairs with local_bh_disable() in kmap_skb_frag() */
+#endif
+}
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
new file mode 100644
index 0000000..bf8f7af
--- /dev/null
+++ b/net/core/link_watch.c
@@ -0,0 +1,228 @@
+/*
+ * Linux network device link state notification
+ *
+ * Author:
+ * Stefan Rompf <sux@loplof.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <asm/types.h>
+
+
+enum lw_bits {
+ LW_URGENT = 0, /* bit in linkwatch_flags: an urgent run is scheduled */
+};
+
+static unsigned long linkwatch_flags;
+static unsigned long linkwatch_nextevent;
+
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
+
+static struct net_device *lweventlist;
+static DEFINE_SPINLOCK(lweventlist_lock);
+
+static unsigned char default_operstate(const struct net_device *dev)
+{
+ /* Carrier present: the state is UP unless the device is dormant. */
+ if (netif_carrier_ok(dev))
+ return netif_dormant(dev) ? IF_OPER_DORMANT : IF_OPER_UP;
+
+ /* No carrier: LOWERLAYERDOWN when ifindex and iflink differ. */
+ if (dev->ifindex != dev->iflink)
+ return IF_OPER_LOWERLAYERDOWN;
+ return IF_OPER_DOWN;
+}
+
+
+static void rfc2863_policy(struct net_device *dev)
+{
+ unsigned char state = default_operstate(dev);
+
+ /*
+ * The unlocked comparison uses the raw default state, i.e. before
+ * the link-mode adjustment below is applied.
+ */
+ if (state == dev->operstate)
+ return;
+
+ write_lock_bh(&dev_base_lock);
+
+ /*
+ * In IF_LINK_MODE_DORMANT a would-be UP is recorded as DORMANT;
+ * every other link mode stores the default state unchanged.
+ */
+ if (dev->link_mode == IF_LINK_MODE_DORMANT && state == IF_OPER_UP)
+ state = IF_OPER_DORMANT;
+
+ dev->operstate = state;
+
+ write_unlock_bh(&dev_base_lock);
+}
+
+
+static bool linkwatch_urgent_event(struct net_device *dev)
+{
+ bool link_up = netif_running(dev) && netif_carrier_ok(dev);
+ return link_up && qdisc_tx_changing(dev); /* up, and qdisc changing */
+}
+
+
+static void linkwatch_add_event(struct net_device *dev)
+{
+ unsigned long flags;
+ /* Push @dev onto the head of the pending list, under its lock. */
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ dev->link_watch_next = lweventlist;
+ lweventlist = dev;
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+}
+
+
+static void linkwatch_schedule_work(int urgent)
+{
+ unsigned long delay = linkwatch_nextevent - jiffies;
+ /* An urgent run is already scheduled, so there is nothing to add. */
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ return;
+
+ /* Minimise down-time: drop delay for up event. */
+ if (urgent) {
+ if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
+ return;
+ delay = 0;
+ }
+
+ /* Unsigned wrap-around (nextevent in the past) shows up as a huge delay. */
+ if (delay > HZ)
+ delay = 0;
+
+ /*
+ * This is true if we've scheduled it immediately or if we don't
+ * need an immediate execution and it's already pending.
+ */
+ if (schedule_delayed_work(&linkwatch_work, delay) == !delay)
+ return;
+
+ /* Don't bother if there is nothing urgent. */
+ if (!test_bit(LW_URGENT, &linkwatch_flags))
+ return;
+
+ /* It's already running which is good enough. */
+ if (!cancel_delayed_work(&linkwatch_work))
+ return;
+
+ /* Otherwise we reschedule it again for immediate execution. */
+ schedule_delayed_work(&linkwatch_work, 0);
+}
+
+
+static void __linkwatch_run_queue(int urgent_only)
+{
+ struct net_device *next;
+
+ /*
+ * Limit the number of linkwatch events to one
+ * per second so that a runaway driver does not
+ * cause a storm of messages on the netlink
+ * socket. This limit does not apply to up events
+ * while the device qdisc is down.
+ */
+ if (!urgent_only)
+ linkwatch_nextevent = jiffies + HZ;
+ /* Limit wrap-around effect on delay. */
+ else if (time_after(linkwatch_nextevent, jiffies + HZ))
+ linkwatch_nextevent = jiffies;
+
+ clear_bit(LW_URGENT, &linkwatch_flags);
+ /* Detach the whole pending list; it is walked without the lock held. */
+ spin_lock_irq(&lweventlist_lock);
+ next = lweventlist;
+ lweventlist = NULL;
+ spin_unlock_irq(&lweventlist_lock);
+
+ while (next) {
+ struct net_device *dev = next;
+
+ next = dev->link_watch_next;
+ /* In an urgent-only run, non-urgent devices are queued again. */
+ if (urgent_only && !linkwatch_urgent_event(dev)) {
+ linkwatch_add_event(dev);
+ continue;
+ }
+
+ /*
+ * Make sure the above read is complete since it can be
+ * rewritten as soon as we clear the bit below.
+ */
+ smp_mb__before_clear_bit();
+
+ /* We are about to handle this device,
+ * so new events can be accepted
+ */
+ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+ rfc2863_policy(dev);
+ if (dev->flags & IFF_UP) {
+ if (netif_carrier_ok(dev))
+ dev_activate(dev);
+ else
+ dev_deactivate(dev);
+
+ netdev_state_change(dev);
+ }
+ /* Drop the reference taken in linkwatch_fire_event(). */
+ dev_put(dev);
+ }
+ /* Entries remain on the list: schedule a normal (non-urgent) run. */
+ if (lweventlist)
+ linkwatch_schedule_work(0);
+}
+
+
+/* Must be called with the rtnl semaphore held */
+void linkwatch_run_queue(void)
+{
+ __linkwatch_run_queue(0); /* urgent_only = 0: handle every queued device */
+}
+
+
+static void linkwatch_event(struct work_struct *dummy)
+{
+ rtnl_lock(); /* urgent-only run while linkwatch_nextevent is still ahead */
+ __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
+ rtnl_unlock();
+}
+
+
+void linkwatch_fire_event(struct net_device *dev)
+{
+ bool urgent = linkwatch_urgent_event(dev);
+ /* The PENDING bit ensures a device is queued at most once. */
+ if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
+ dev_hold(dev);
+ /* The reference is dropped by dev_put() in __linkwatch_run_queue(). */
+ linkwatch_add_event(dev);
+ } else if (!urgent)
+ return;
+
+ linkwatch_schedule_work(urgent);
+}
+
+EXPORT_SYMBOL(linkwatch_fire_event);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
new file mode 100644
index 0000000..1dc728b
--- /dev/null
+++ b/net/core/neighbour.c
@@ -0,0 +1,2831 @@
+/*
+ * Generic address resolution entity
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
+ * Harald Welte Add neighbour cache statistics like rtstat
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/times.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/log2.h>
+
+#define NEIGH_DEBUG 1 /* verbosity: >= 1 enables NEIGH_PRINTK1, >= 2 also NEIGH_PRINTK2 */
+
+#define NEIGH_PRINTK(x...) printk(x) /* active variant: forwards to printk() */
+#define NEIGH_NOPRINTK(x...) do { ; } while(0) /* disabled variant: expands to an empty statement */
+#define NEIGH_PRINTK0 NEIGH_PRINTK /* level 0 messages are always printed */
+#define NEIGH_PRINTK1 NEIGH_NOPRINTK /* off by default; re-defined below when NEIGH_DEBUG >= 1 */
+#define NEIGH_PRINTK2 NEIGH_NOPRINTK /* off by default; re-defined below when NEIGH_DEBUG >= 2 */
+
+#if NEIGH_DEBUG >= 1
+#undef NEIGH_PRINTK1
+#define NEIGH_PRINTK1 NEIGH_PRINTK
+#endif
+#if NEIGH_DEBUG >= 2
+#undef NEIGH_PRINTK2
+#define NEIGH_PRINTK2 NEIGH_PRINTK
+#endif
+
+#define PNEIGH_HASHMASK 0xF /* mask applied to proxy-entry hash values; bucket indices run 0..PNEIGH_HASHMASK */
+
+static void neigh_timer_handler(unsigned long arg);
+static void __neigh_notify(struct neighbour *n, int type, int flags);
+static void neigh_update_notify(struct neighbour *neigh);
+static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+
+static struct neigh_table *neigh_tables;
+#ifdef CONFIG_PROC_FS
+static const struct file_operations neigh_stat_seq_fops;
+#endif
+
+/*
+ Neighbour hash table buckets are protected with rwlock tbl->lock.
+
+ - All the scans/updates to hash buckets MUST be made under this lock.
+ - NOTHING clever should be made under this lock: no callbacks
+ to protocol backends, no attempts to send something to network.
+ It will result in deadlocks, if backend/driver wants to use neighbour
+ cache.
+ - If the entry requires some non-trivial actions, increase
+ its reference count and release table lock.
+
+ Neighbour entries are protected:
+ - with reference count.
+ - with rwlock neigh->lock
+
+ Reference count prevents destruction.
+
+ neigh->lock mainly serializes the link-layer address data and its validity state.
+ However, the same lock is also used to protect other entry fields:
+ - timer
+ - resolution queue
+
+ Again, nothing clever shall be made under neigh->lock,
+ the most complicated procedure, which we allow is dev->hard_header.
+ It is supposed, that dev->hard_header is simplistic and does
+ not make callbacks to neighbour tables.
+
+ The last lock is neigh_tbl_lock. It is a pure SMP lock protecting
+ the list of neighbour tables. This list is used only in process context.
+ */
+
+static DEFINE_RWLOCK(neigh_tbl_lock);
+
+/*
+ * Output handler that discards every packet: frees @skb and reports
+ * -ENETDOWN to the caller.
+ */
+static int neigh_blackhole(struct sk_buff *skb)
+{
+	const int err = -ENETDOWN;
+
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Final steps for an entry that has been unlinked from the hash table:
+ * run the optional per-parms cleanup hook, emit an RTM_DELNEIGH
+ * notification and drop one reference.
+ */
+static void neigh_cleanup_and_release(struct neighbour *neigh)
+{
+	struct neigh_parms *parms = neigh->parms;
+
+	if (parms->neigh_cleanup)
+		parms->neigh_cleanup(neigh);
+
+	__neigh_notify(neigh, RTM_DELNEIGH, 0);
+	neigh_release(neigh);
+}
+
+/*
+ * Pick a random value in the interval (1/2)*base ... (3/2)*base.
+ * It corresponds to the default IPv6 settings and is not overridable,
+ * because it is a really reasonable choice.
+ *
+ * Returns 0 when base is 0.
+ */
+
+unsigned long neigh_rand_reach_time(unsigned long base)
+{
+	if (!base)
+		return 0;
+
+	return (net_random() % base) + (base >> 1);
+}
+EXPORT_SYMBOL(neigh_rand_reach_time);
+
+
+/*
+ * Synchronous garbage collection over the whole hash table.
+ *
+ * Under tbl->lock, every entry that is referenced only by the table
+ * (refcnt == 1) and is not NUD_PERMANENT is unlinked, marked dead and
+ * released.  tbl->last_flush is updated to the current jiffies.
+ *
+ * Returns 1 if at least one entry was removed, 0 otherwise.
+ */
+static int neigh_forced_gc(struct neigh_table *tbl)
+{
+	int freed = 0;
+	int bucket;
+
+	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
+
+	write_lock_bh(&tbl->lock);
+	for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
+		struct neighbour **link = &tbl->hash_buckets[bucket];
+		struct neighbour *cur;
+
+		for (cur = *link; cur != NULL; cur = *link) {
+			int reclaim;
+
+			write_lock(&cur->lock);
+			/* Discard only if nobody else refers to the record
+			 * and it is not permanent.
+			 */
+			reclaim = atomic_read(&cur->refcnt) == 1 &&
+				  !(cur->nud_state & NUD_PERMANENT);
+			if (reclaim) {
+				*link = cur->next;
+				cur->dead = 1;
+				freed = 1;
+			}
+			write_unlock(&cur->lock);
+
+			if (reclaim)
+				neigh_cleanup_and_release(cur);
+			else
+				link = &cur->next;
+		}
+	}
+
+	tbl->last_flush = jiffies;
+
+	write_unlock_bh(&tbl->lock);
+
+	return freed;
+}
+
+/*
+ * Arm the entry's timer to fire at @when, taking a reference on the
+ * entry for the timer.  The timer is expected to be idle; if
+ * mod_timer() reports it was already pending, a diagnostic and a stack
+ * dump are emitted.
+ */
+static void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+	int was_pending;
+
+	neigh_hold(n);
+	was_pending = mod_timer(&n->timer, when);
+	if (unlikely(was_pending)) {
+		printk("NEIGH: BUG, double timer add, state is %x\n",
+		       n->nud_state);
+		dump_stack();
+	}
+}
+
+/*
+ * Cancel the entry's timer if its state says one may be running.
+ *
+ * Returns 1 when a pending timer was removed (the reference held for the
+ * timer is dropped in that case), 0 otherwise.
+ */
+static int neigh_del_timer(struct neighbour *n)
+{
+	if (!(n->nud_state & NUD_IN_TIMER))
+		return 0;
+
+	if (!del_timer(&n->timer))
+		return 0;
+
+	neigh_release(n);
+	return 1;
+}
+
+/*
+ * Empty @list: for every queued skb drop the reference on skb->dev and
+ * free the skb.
+ */
+static void pneigh_queue_purge(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+
+	for (;;) {
+		skb = skb_dequeue(list);
+		if (skb == NULL)
+			break;
+
+		dev_put(skb->dev);
+		kfree_skb(skb);
+	}
+}
+
+/*
+ * Remove every entry of @tbl that belongs to @dev (all entries when
+ * @dev is NULL).  Both callers visible in this file hold tbl->lock for
+ * writing around the call.
+ *
+ * Each matching entry is unlinked, its timer cancelled and it is marked
+ * dead.  Entries still referenced elsewhere cannot be destroyed yet, so
+ * they are moved to a safe state first.
+ */
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+{
+	int bucket;
+
+	for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
+		struct neighbour **link = &tbl->hash_buckets[bucket];
+		struct neighbour *cur;
+
+		while ((cur = *link) != NULL) {
+			if (!dev || cur->dev == dev) {
+				*link = cur->next;
+				write_lock(&cur->lock);
+				neigh_del_timer(cur);
+				cur->dead = 1;
+
+				if (atomic_read(&cur->refcnt) != 1) {
+					/* The most unpleasant situation:
+					 * the entry must go away, but
+					 * someone still uses it.
+					 *
+					 * Destruction is delayed until
+					 * the last user releases it, so
+					 * purge the queue, blackhole the
+					 * output and move the entry to a
+					 * safe state now.
+					 */
+					skb_queue_purge(&cur->arp_queue);
+					cur->output = neigh_blackhole;
+					cur->nud_state =
+						(cur->nud_state & NUD_VALID) ?
+						NUD_NOARP : NUD_NONE;
+					NEIGH_PRINTK2("neigh %p is stray.\n", cur);
+				}
+				write_unlock(&cur->lock);
+				neigh_cleanup_and_release(cur);
+			} else {
+				link = &cur->next;
+			}
+		}
+	}
+}
+
+/*
+ * Flush all entries of @tbl that belong to @dev, holding tbl->lock for
+ * writing while doing so.
+ */
+void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
+{
+	rwlock_t *tbl_lock = &tbl->lock;
+
+	write_lock_bh(tbl_lock);
+	neigh_flush_dev(tbl, dev);
+	write_unlock_bh(tbl_lock);
+}
+EXPORT_SYMBOL(neigh_changeaddr);
+
+/*
+ * Drop all neighbour and proxy-neighbour entries of @tbl bound to @dev,
+ * then stop the proxy timer and purge the proxy queue.
+ *
+ * Always returns 0.
+ */
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+	rwlock_t *tbl_lock = &tbl->lock;
+
+	write_lock_bh(tbl_lock);
+	neigh_flush_dev(tbl, dev);
+	pneigh_ifdown(tbl, dev);
+	write_unlock_bh(tbl_lock);
+
+	/* Done outside the table lock. */
+	del_timer_sync(&tbl->proxy_timer);
+	pneigh_queue_purge(&tbl->proxy_queue);
+
+	return 0;
+}
+EXPORT_SYMBOL(neigh_ifdown);
+
+/*
+ * Allocate and minimally initialise a neighbour entry for @tbl.
+ *
+ * tbl->entries is incremented up front.  When the previous count has
+ * reached gc_thresh3, or has reached gc_thresh2 and the last flush is
+ * more than 5 seconds old, a forced GC is run first; if that frees
+ * nothing and the count is at or above gc_thresh3 the allocation is
+ * refused.
+ *
+ * The new entry starts with refcnt 1, state NUD_NONE, blackhole output
+ * and dead = 1 (it is not in the hash table yet).
+ *
+ * Returns the entry, or NULL on failure (tbl->entries is restored).
+ */
+static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+{
+	struct neighbour *n;
+	unsigned long now = jiffies;
+	int entries = atomic_inc_return(&tbl->entries) - 1;
+
+	if (entries >= tbl->gc_thresh3 ||
+	    (entries >= tbl->gc_thresh2 &&
+	     time_after(now, tbl->last_flush + 5 * HZ))) {
+		if (!neigh_forced_gc(tbl) &&
+		    entries >= tbl->gc_thresh3) {
+			atomic_dec(&tbl->entries);
+			return NULL;
+		}
+	}
+
+	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+	if (n == NULL) {
+		atomic_dec(&tbl->entries);
+		return NULL;
+	}
+
+	skb_queue_head_init(&n->arp_queue);
+	rwlock_init(&n->lock);
+	n->updated = n->used = now;
+	n->nud_state = NUD_NONE;
+	n->output = neigh_blackhole;
+	n->parms = neigh_parms_clone(&tbl->parms);
+	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
+
+	NEIGH_CACHE_STAT_INC(tbl, allocs);
+	n->tbl = tbl;
+	atomic_set(&n->refcnt, 1);
+	n->dead = 1;
+
+	return n;
+}
+
+/*
+ * Allocate a zeroed bucket array with @entries slots.  Arrays of up to
+ * one page come from kzalloc(), larger ones from the page allocator.
+ * Both use GFP_ATOMIC.  Returns NULL on failure.
+ */
+static struct neighbour **neigh_hash_alloc(unsigned int entries)
+{
+	unsigned long bytes = entries * sizeof(struct neighbour *);
+
+	if (bytes > PAGE_SIZE)
+		return (struct neighbour **)
+			__get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(bytes));
+
+	return kzalloc(bytes, GFP_ATOMIC);
+}
+
+/*
+ * Free a bucket array obtained from neigh_hash_alloc().  @entries must
+ * be the slot count used at allocation time so the matching allocator
+ * is chosen.
+ */
+static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+{
+	unsigned long bytes = entries * sizeof(struct neighbour *);
+
+	if (bytes > PAGE_SIZE) {
+		free_pages((unsigned long)hash, get_order(bytes));
+		return;
+	}
+
+	kfree(hash);
+}
+
+/*
+ * Replace the bucket array of @tbl with one of @new_entries slots
+ * (must be a power of two) and rehash every entry into it.  A fresh
+ * hash_rnd is drawn before rehashing.  If the new array cannot be
+ * allocated the table is left untouched.
+ *
+ * The only caller visible in this file holds tbl->lock for writing.
+ */
+static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
+{
+	struct neighbour **new_hash, **old_hash;
+	unsigned int i, new_hash_mask, old_entries;
+
+	NEIGH_CACHE_STAT_INC(tbl, hash_grows);
+
+	BUG_ON(!is_power_of_2(new_entries));
+	new_hash = neigh_hash_alloc(new_entries);
+	if (new_hash == NULL)
+		return;
+
+	old_entries = tbl->hash_mask + 1;
+	new_hash_mask = new_entries - 1;
+	old_hash = tbl->hash_buckets;
+
+	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+	for (i = 0; i < old_entries; i++) {
+		struct neighbour *n = old_hash[i];
+
+		/* Move the chain entry by entry onto the new buckets. */
+		while (n != NULL) {
+			unsigned int slot =
+				tbl->hash(n->primary_key, n->dev) & new_hash_mask;
+			struct neighbour *next = n->next;
+
+			n->next = new_hash[slot];
+			new_hash[slot] = n;
+			n = next;
+		}
+	}
+	tbl->hash_buckets = new_hash;
+	tbl->hash_mask = new_hash_mask;
+
+	neigh_hash_free(old_hash, old_entries);
+}
+
+/*
+ * Find the entry of @tbl whose key equals @pkey and whose device is
+ * @dev.  The search runs under tbl->lock held for reading.
+ *
+ * Returns the entry with an extra reference taken, or NULL if there is
+ * no match.
+ */
+struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
+			       struct net_device *dev)
+{
+	const int key_len = tbl->key_len;
+	struct neighbour *n;
+	u32 hash_val;
+
+	NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+	read_lock_bh(&tbl->lock);
+	hash_val = tbl->hash(pkey, dev);
+	n = tbl->hash_buckets[hash_val & tbl->hash_mask];
+	while (n != NULL) {
+		if (dev == n->dev && memcmp(n->primary_key, pkey, key_len) == 0) {
+			neigh_hold(n);
+			NEIGH_CACHE_STAT_INC(tbl, hits);
+			break;
+		}
+		n = n->next;
+	}
+	read_unlock_bh(&tbl->lock);
+
+	return n;
+}
+EXPORT_SYMBOL(neigh_lookup);
+
+/*
+ * Device-independent lookup: find the entry of @tbl whose key equals
+ * @pkey and whose device lives in namespace @net.  The hash is computed
+ * with a NULL device.
+ *
+ * Returns the entry with an extra reference taken, or NULL if there is
+ * no match.
+ */
+struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
+				     const void *pkey)
+{
+	const int key_len = tbl->key_len;
+	struct neighbour *n;
+	u32 hash_val;
+
+	NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+	read_lock_bh(&tbl->lock);
+	hash_val = tbl->hash(pkey, NULL);
+	n = tbl->hash_buckets[hash_val & tbl->hash_mask];
+	while (n != NULL) {
+		if (memcmp(n->primary_key, pkey, key_len) == 0 &&
+		    net_eq(dev_net(n->dev), net)) {
+			neigh_hold(n);
+			NEIGH_CACHE_STAT_INC(tbl, hits);
+			break;
+		}
+		n = n->next;
+	}
+	read_unlock_bh(&tbl->lock);
+
+	return n;
+}
+EXPORT_SYMBOL(neigh_lookup_nodev);
+
+/*
+ * Create the entry for (@pkey, @dev) in @tbl, or return the existing one.
+ *
+ * The new entry is allocated and set up (protocol constructor, then the
+ * per-device neigh_setup hook) before the table lock is taken.  Under
+ * the lock the table is grown if it holds more entries than buckets and
+ * the chain is searched for a duplicate; if one exists the new entry is
+ * dropped and the existing one is returned instead.
+ *
+ * Returns the entry with a reference held for the caller, or an
+ * ERR_PTR(): -ENOBUFS when allocation fails, the negative code of a
+ * failing constructor/setup hook, or -EINVAL when the parms are dead.
+ */
+struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
+			       struct net_device *dev)
+{
+	struct neighbour *n, *dup, *rc;
+	int key_len = tbl->key_len;
+	u32 hash_val;
+	int error;
+
+	n = neigh_alloc(tbl);
+	if (n == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	memcpy(n->primary_key, pkey, key_len);
+	n->dev = dev;
+	dev_hold(dev);
+
+	/* Protocol specific setup. */
+	if (tbl->constructor) {
+		error = tbl->constructor(n);
+		if (error < 0) {
+			rc = ERR_PTR(error);
+			goto drop_new;
+		}
+	}
+
+	/* Device specific setup. */
+	if (n->parms->neigh_setup) {
+		error = n->parms->neigh_setup(n);
+		if (error < 0) {
+			rc = ERR_PTR(error);
+			goto drop_new;
+		}
+	}
+
+	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
+
+	write_lock_bh(&tbl->lock);
+
+	if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
+		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+
+	hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
+
+	if (n->parms->dead) {
+		rc = ERR_PTR(-EINVAL);
+		goto unlock_drop_new;
+	}
+
+	/* Somebody may have inserted the same key in the meantime. */
+	for (dup = tbl->hash_buckets[hash_val]; dup != NULL; dup = dup->next) {
+		if (dev == dup->dev &&
+		    memcmp(dup->primary_key, pkey, key_len) == 0) {
+			neigh_hold(dup);
+			rc = dup;
+			goto unlock_drop_new;
+		}
+	}
+
+	n->next = tbl->hash_buckets[hash_val];
+	tbl->hash_buckets[hash_val] = n;
+	n->dead = 0;
+	neigh_hold(n);
+	write_unlock_bh(&tbl->lock);
+	NEIGH_PRINTK2("neigh %p is created.\n", n);
+	return n;
+
+unlock_drop_new:
+	write_unlock_bh(&tbl->lock);
+drop_new:
+	neigh_release(n);
+	return rc;
+}
+EXPORT_SYMBOL(neigh_create);
+
+/*
+ * Hash a proxy-neighbour key: read the last four bytes of the key as a
+ * u32, fold the upper bits down with xor-shifts and mask the result
+ * with PNEIGH_HASHMASK.
+ */
+static u32 pneigh_hash(const void *pkey, int key_len)
+{
+	u32 h = *(u32 *)(pkey + key_len - 4);
+
+	h ^= (h >> 16);
+	h ^= h >> 8;
+	h ^= h >> 4;
+
+	return h & PNEIGH_HASHMASK;
+}
+
+/*
+ * Walk the chain starting at @n and return the first proxy entry whose
+ * key equals @pkey, whose namespace is @net and whose device is either
+ * @dev or unset.  Returns NULL when the chain has no such entry.
+ */
+static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
+					      struct net *net,
+					      const void *pkey,
+					      int key_len,
+					      struct net_device *dev)
+{
+	for (; n != NULL; n = n->next) {
+		if (memcmp(n->key, pkey, key_len) == 0 &&
+		    net_eq(pneigh_net(n), net) &&
+		    (n->dev == dev || !n->dev))
+			return n;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up a proxy entry in @tbl without taking tbl->lock here; any
+ * locking is left to the caller.
+ * Returns the matching entry or NULL.
+ */
+struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
+		struct net *net, const void *pkey, struct net_device *dev)
+{
+	int key_len = tbl->key_len;
+	struct pneigh_entry *chain;
+
+	chain = tbl->phash_buckets[pneigh_hash(pkey, key_len)];
+
+	return __pneigh_lookup_1(chain, net, pkey, key_len, dev);
+}
+EXPORT_SYMBOL_GPL(__pneigh_lookup);
+
+/*
+ * Look up the proxy entry for (@net, @pkey, @dev) in @tbl and, when
+ * @creat is non-zero and no entry exists, create one.
+ *
+ * The lookup runs under tbl->lock held for reading.  Creation asserts
+ * that the RTNL is held, allocates with GFP_KERNEL, takes a reference
+ * on @dev (if any), runs the optional tbl->pconstructor hook and links
+ * the entry into its bucket under the write lock.
+ *
+ * Returns the entry, or NULL if it was not found (and @creat is zero),
+ * the allocation failed, or the constructor returned non-zero.
+ */
+struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
+				    struct net *net, const void *pkey,
+				    struct net_device *dev, int creat)
+{
+	struct pneigh_entry *n;
+	int key_len = tbl->key_len;
+	u32 hash_val = pneigh_hash(pkey, key_len);
+
+	read_lock_bh(&tbl->lock);
+	n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+			      net, pkey, key_len, dev);
+	read_unlock_bh(&tbl->lock);
+
+	if (n || !creat)
+		goto out;
+
+	ASSERT_RTNL();
+
+	/*
+	 * Zero the whole entry: only some of its fields are assigned
+	 * below, and the rest must not be left holding stale heap
+	 * contents.
+	 */
+	n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
+	if (!n)
+		goto out;
+
+#ifdef CONFIG_NET_NS
+	n->net = hold_net(net);
+#endif
+	memcpy(n->key, pkey, key_len);
+	n->dev = dev;
+	if (dev)
+		dev_hold(dev);
+
+	if (tbl->pconstructor && tbl->pconstructor(n)) {
+		/* Undo everything acquired above. */
+		if (dev)
+			dev_put(dev);
+		release_net(net);
+		kfree(n);
+		n = NULL;
+		goto out;
+	}
+
+	write_lock_bh(&tbl->lock);
+	n->next = tbl->phash_buckets[hash_val];
+	tbl->phash_buckets[hash_val] = n;
+	write_unlock_bh(&tbl->lock);
+out:
+	return n;
+}
+EXPORT_SYMBOL(pneigh_lookup);
+
+
+/*
+ * Remove the proxy entry matching (@net, @pkey, @dev) from @tbl.
+ *
+ * The entry is unlinked under tbl->lock; the destructor hook, the
+ * device/namespace reference drops and the free happen after the lock
+ * has been released.
+ *
+ * Returns 0 on success or -ENOENT when no entry matches.
+ */
+int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
+		  struct net_device *dev)
+{
+	struct pneigh_entry *n, **np;
+	int key_len = tbl->key_len;
+	u32 hash_val = pneigh_hash(pkey, key_len);
+
+	write_lock_bh(&tbl->lock);
+	np = &tbl->phash_buckets[hash_val];
+	while ((n = *np) != NULL) {
+		if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev &&
+		    net_eq(pneigh_net(n), net))
+			break;
+		np = &n->next;
+	}
+
+	if (n == NULL) {
+		write_unlock_bh(&tbl->lock);
+		return -ENOENT;
+	}
+
+	*np = n->next;
+	write_unlock_bh(&tbl->lock);
+
+	if (tbl->pdestructor)
+		tbl->pdestructor(n);
+	if (n->dev)
+		dev_put(n->dev);
+	release_net(pneigh_net(n));
+	kfree(n);
+	return 0;
+}
+
+/*
+ * Destroy every proxy entry of @tbl bound to @dev (all entries when
+ * @dev is NULL).  The caller visible in this file holds tbl->lock for
+ * writing around the call.
+ *
+ * Always returns -ENOENT.
+ */
+static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+	u32 h;
+
+	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
+		struct pneigh_entry **link = &tbl->phash_buckets[h];
+		struct pneigh_entry *n;
+
+		while ((n = *link) != NULL) {
+			if (dev && n->dev != dev) {
+				/* Not ours: keep it and move on. */
+				link = &n->next;
+				continue;
+			}
+
+			*link = n->next;
+			if (tbl->pdestructor)
+				tbl->pdestructor(n);
+			if (n->dev)
+				dev_put(n->dev);
+			release_net(pneigh_net(n));
+			kfree(n);
+		}
+	}
+	return -ENOENT;
+}
+
+static void neigh_parms_destroy(struct neigh_parms *parms);
+
+/* Drop one reference on @parms; the last put destroys the object. */
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+	if (!atomic_dec_and_test(&parms->refcnt))
+		return;
+
+	neigh_parms_destroy(parms);
+}
+
+/*
+ * Free a neighbour entry.  The neighbour must already be out of the
+ * table (neigh->dead set); a live entry is reported and left untouched.
+ *
+ * Detaches and blackholes every cached hardware header, purges the
+ * resolution queue, drops the device and parms references, decrements
+ * the table's entry count and returns the memory to the table's cache.
+ */
+void neigh_destroy(struct neighbour *neigh)
+{
+	struct hh_cache *hh;
+
+	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
+
+	if (!neigh->dead) {
+		printk(KERN_WARNING
+		       "Destroying alive neighbour %p\n", neigh);
+		dump_stack();
+		return;
+	}
+
+	if (neigh_del_timer(neigh))
+		printk(KERN_WARNING "Impossible event.\n");
+
+	for (hh = neigh->hh; hh != NULL; hh = neigh->hh) {
+		/* Unhook the head of the list before neutralising it. */
+		neigh->hh = hh->hh_next;
+		hh->hh_next = NULL;
+
+		write_seqlock_bh(&hh->hh_lock);
+		hh->hh_output = neigh_blackhole;
+		write_sequnlock_bh(&hh->hh_lock);
+		if (atomic_dec_and_test(&hh->hh_refcnt))
+			kfree(hh);
+	}
+
+	skb_queue_purge(&neigh->arp_queue);
+
+	dev_put(neigh->dev);
+	neigh_parms_put(neigh->parms);
+
+	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
+
+	atomic_dec(&neigh->tbl->entries);
+	kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+}
+EXPORT_SYMBOL(neigh_destroy);
+
+/* Neighbour state is suspicious; disable the fast path.
+
+   The entry and all of its cached hardware headers are switched to
+   ops->output.
+
+   Called with write_locked neigh.
+ */
+static void neigh_suspect(struct neighbour *neigh)
+{
+	struct hh_cache *cached;
+
+	NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+
+	neigh->output = neigh->ops->output;
+
+	cached = neigh->hh;
+	while (cached != NULL) {
+		cached->hh_output = neigh->ops->output;
+		cached = cached->hh_next;
+	}
+}
+
+/* Neighbour state is OK; enable the fast path.
+
+   The entry is switched to ops->connected_output and all of its cached
+   hardware headers to ops->hh_output.
+
+   Called with write_locked neigh.
+ */
+static void neigh_connect(struct neighbour *neigh)
+{
+	struct hh_cache *cached;
+
+	NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
+
+	neigh->output = neigh->ops->connected_output;
+
+	cached = neigh->hh;
+	while (cached != NULL) {
+		cached->hh_output = neigh->ops->hh_output;
+		cached = cached->hh_next;
+	}
+}
+
+/*
+ * Periodic garbage collector, run from tbl->gc_timer.  @arg is the
+ * neigh_table pointer.
+ *
+ * Each run, under tbl->lock:
+ *  - re-randomises reachable_time of every parms set if more than
+ *    300 seconds passed since the last time;
+ *  - scans one hash bucket (tbl->hash_chain_gc, advanced round-robin)
+ *    and releases unreferenced entries that are NUD_FAILED or unused
+ *    for longer than gc_staletime; permanent entries and entries with
+ *    a running timer are skipped;
+ *  - re-arms the timer.
+ */
+static void neigh_periodic_timer(unsigned long arg)
+{
+	struct neigh_table *tbl = (struct neigh_table *)arg;
+	struct neighbour *n, **np;
+	unsigned long expire, now = jiffies;
+
+	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
+
+	write_lock(&tbl->lock);
+
+	/*
+	 *	periodically recompute ReachableTime from random function
+	 */
+
+	if (time_after(now, tbl->last_rand + 300 * HZ)) {
+		struct neigh_parms *p = &tbl->parms;
+
+		tbl->last_rand = now;
+		while (p != NULL) {
+			p->reachable_time =
+				neigh_rand_reach_time(p->base_reachable_time);
+			p = p->next;
+		}
+	}
+
+	np = &tbl->hash_buckets[tbl->hash_chain_gc];
+	tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
+
+	while ((n = *np) != NULL) {
+		unsigned int state;
+		int reap = 0;
+
+		write_lock(&n->lock);
+
+		state = n->nud_state;
+		if (!(state & (NUD_PERMANENT | NUD_IN_TIMER))) {
+			if (time_before(n->used, n->confirmed))
+				n->used = n->confirmed;
+
+			if (atomic_read(&n->refcnt) == 1 &&
+			    (state == NUD_FAILED ||
+			     time_after(now, n->used + n->parms->gc_staletime))) {
+				*np = n->next;
+				n->dead = 1;
+				reap = 1;
+			}
+		}
+		write_unlock(&n->lock);
+
+		if (reap) {
+			neigh_cleanup_and_release(n);
+			continue;
+		}
+
+		np = &n->next;
+	}
+
+	/* Cycle through all hash buckets every base_reachable_time/2 ticks.
+	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
+	 * base_reachable_time.
+	 */
+	expire = tbl->parms.base_reachable_time >> 1;
+	expire /= (tbl->hash_mask + 1);
+	if (!expire)
+		expire = 1;
+
+	/* Only long intervals are rounded to a whole-second boundary. */
+	if (expire > HZ)
+		mod_timer(&tbl->gc_timer, round_jiffies(now + expire));
+	else
+		mod_timer(&tbl->gc_timer, now + expire);
+
+	write_unlock(&tbl->lock);
+}
+
+/*
+ * Number of probes allowed before resolution is declared failed:
+ * only the unicast budget while in NUD_PROBE, otherwise the sum of the
+ * unicast, application and multicast budgets.
+ */
+static __inline__ int neigh_max_probes(struct neighbour *n)
+{
+	const struct neigh_parms *p = n->parms;
+
+	if (n->nud_state & NUD_PROBE)
+		return p->ucast_probes;
+
+	return p->ucast_probes + p->app_probes + p->mcast_probes;
+}
+
+/* Called when a timer expires for a neighbour entry.
+ *
+ * arg is the struct neighbour pointer registered with the timer.
+ * Under neigh->lock the handler re-evaluates the entry's NUD state:
+ *  - NUD_REACHABLE: stays reachable while the last confirmation is
+ *    recent enough, otherwise moves to NUD_DELAY (recently used) or
+ *    NUD_STALE.
+ *  - NUD_DELAY: returns to NUD_REACHABLE if a confirmation arrived in
+ *    the meantime, otherwise enters NUD_PROBE with the probe counter
+ *    reset.
+ *  - NUD_PROBE / NUD_INCOMPLETE: retried every retrans_time until
+ *    neigh_max_probes() is reached; then the entry becomes NUD_FAILED
+ *    and queued packets are handed to ops->error_report().
+ * While the entry remains in a timed state the timer is re-armed, and
+ * in NUD_INCOMPLETE / NUD_PROBE a solicitation is sent.
+ * The reference dropped at the end pairs with the neigh_hold() taken
+ * when the timer was armed.
+ */
+
+static void neigh_timer_handler(unsigned long arg)
+{
+ unsigned long now, next;
+ struct neighbour *neigh = (struct neighbour *)arg;
+ unsigned state;
+ int notify = 0; /* set when a state change must be announced after unlocking */
+
+ write_lock(&neigh->lock);
+
+ state = neigh->nud_state;
+ now = jiffies;
+ next = now + HZ; /* default re-arm time; refined per state below */
+
+ if (!(state & NUD_IN_TIMER)) { /* state no longer needs a timer: nothing to do */
+#ifndef CONFIG_SMP
+ printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
+#endif
+ goto out;
+ }
+
+ if (state & NUD_REACHABLE) {
+ if (time_before_eq(now,
+ neigh->confirmed + neigh->parms->reachable_time)) {
+ NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
+ next = neigh->confirmed + neigh->parms->reachable_time;
+ } else if (time_before_eq(now,
+ neigh->used + neigh->parms->delay_probe_time)) {
+ NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+ neigh->nud_state = NUD_DELAY;
+ neigh->updated = jiffies;
+ neigh_suspect(neigh);
+ next = now + neigh->parms->delay_probe_time;
+ } else {
+ NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+ neigh->nud_state = NUD_STALE;
+ neigh->updated = jiffies;
+ neigh_suspect(neigh);
+ notify = 1;
+ }
+ } else if (state & NUD_DELAY) {
+ if (time_before_eq(now,
+ neigh->confirmed + neigh->parms->delay_probe_time)) {
+ NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh);
+ neigh->nud_state = NUD_REACHABLE;
+ neigh->updated = jiffies;
+ neigh_connect(neigh);
+ notify = 1;
+ next = neigh->confirmed + neigh->parms->reachable_time;
+ } else {
+ NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
+ neigh->nud_state = NUD_PROBE;
+ neigh->updated = jiffies;
+ atomic_set(&neigh->probes, 0);
+ next = now + neigh->parms->retrans_time;
+ }
+ } else {
+ /* NUD_PROBE|NUD_INCOMPLETE */
+ next = now + neigh->parms->retrans_time;
+ }
+
+ if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { /* probe budget exhausted */
+ struct sk_buff *skb;
+
+ neigh->nud_state = NUD_FAILED;
+ neigh->updated = jiffies;
+ notify = 1;
+ NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
+ NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
+
+ /* It is very thin place. report_unreachable is very complicated
+ routine. Particularly, it can hit the same neighbour entry!
+
+ So that, we try to be accurate and avoid dead loop. --ANK
+ */
+ while (neigh->nud_state == NUD_FAILED &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ write_unlock(&neigh->lock); /* error_report runs without the entry lock */
+ neigh->ops->error_report(neigh, skb);
+ write_lock(&neigh->lock);
+ }
+ skb_queue_purge(&neigh->arp_queue);
+ }
+
+ if (neigh->nud_state & NUD_IN_TIMER) {
+ if (time_before(next, jiffies + HZ/2)) /* never re-arm sooner than HZ/2 from now */
+ next = jiffies + HZ/2;
+ if (!mod_timer(&neigh->timer, next)) /* timer was idle: take a reference for it */
+ neigh_hold(neigh);
+ }
+ if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
+ struct sk_buff *skb = skb_peek(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_copy(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ neigh->ops->solicit(neigh, skb); /* called unlocked; skb may be NULL */
+ atomic_inc(&neigh->probes);
+ if (skb)
+ kfree_skb(skb);
+ } else {
+out:
+ write_unlock(&neigh->lock);
+ }
+
+ if (notify)
+ neigh_update_notify(neigh);
+
+ neigh_release(neigh);
+}
+
+/*
+ * Slow path of an output attempt on @neigh; @skb is the packet waiting
+ * for resolution and may be NULL.
+ *
+ * Under neigh->lock:
+ *  - connected, delayed or probing entries need nothing: return 0;
+ *  - a stale entry moves to NUD_DELAY and its timer is armed;
+ *  - an entry that is neither stale nor incomplete starts resolution
+ *    (NUD_INCOMPLETE, timer armed), unless no multicast/application
+ *    probes are configured, in which case it becomes NUD_FAILED, @skb
+ *    is freed and 1 is returned;
+ *  - while incomplete, @skb is appended to the resolution queue (the
+ *    oldest packet is dropped when the queue is full).
+ *
+ * Returns 0 when the caller may transmit right away, 1 when the packet
+ * was queued or dropped.
+ */
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+{
+	unsigned long now;
+	int rc = 0;
+
+	write_lock_bh(&neigh->lock);
+
+	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
+		goto unlock;
+
+	now = jiffies;
+
+	if (neigh->nud_state & NUD_STALE) {
+		NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+		neigh->nud_state = NUD_DELAY;
+		neigh->updated = jiffies;
+		neigh_add_timer(neigh,
+				jiffies + neigh->parms->delay_probe_time);
+	} else if (!(neigh->nud_state & NUD_INCOMPLETE)) {
+		if (!(neigh->parms->mcast_probes + neigh->parms->app_probes)) {
+			/* Nothing can resolve this entry: fail at once. */
+			neigh->nud_state = NUD_FAILED;
+			neigh->updated = jiffies;
+			write_unlock_bh(&neigh->lock);
+
+			if (skb)
+				kfree_skb(skb);
+			return 1;
+		}
+
+		atomic_set(&neigh->probes, neigh->parms->ucast_probes);
+		neigh->nud_state = NUD_INCOMPLETE;
+		neigh->updated = jiffies;
+		neigh_add_timer(neigh, now + 1);
+	}
+
+	if (neigh->nud_state == NUD_INCOMPLETE) {
+		if (skb) {
+			if (skb_queue_len(&neigh->arp_queue) >=
+			    neigh->parms->queue_len) {
+				struct sk_buff *oldest;
+
+				oldest = __skb_dequeue(&neigh->arp_queue);
+				kfree_skb(oldest);
+				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
+			}
+			__skb_queue_tail(&neigh->arp_queue, skb);
+		}
+		rc = 1;
+	}
+unlock:
+	write_unlock_bh(&neigh->lock);
+	return rc;
+}
+EXPORT_SYMBOL(__neigh_event_send);
+
+/*
+ * Refresh every cached hardware header of @neigh from neigh->ha by
+ * calling the device's header_ops->cache_update hook, each one under
+ * its own hh_lock seqlock.
+ *
+ * Nothing is done when the device has no header_ops at all or the ops
+ * provide no cache_update hook.
+ */
+static void neigh_update_hhs(struct neighbour *neigh)
+{
+	struct hh_cache *hh;
+	void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
+		= NULL;
+
+	/*
+	 * header_ops itself may be NULL, so it must be checked before the
+	 * hook is read.  NOTE(review): presumably this is the case for
+	 * devices that build no link-layer header - confirm against the
+	 * drivers.
+	 */
+	if (neigh->dev->header_ops)
+		update = neigh->dev->header_ops->cache_update;
+
+	if (!update)
+		return;
+
+	for (hh = neigh->hh; hh; hh = hh->hh_next) {
+		write_seqlock_bh(&hh->hh_lock);
+		update(hh, neigh->dev, neigh->ha);
+		write_sequnlock_bh(&hh->hh_lock);
+	}
+}
+
+
+
+/* Generic update routine.
+ -- lladdr is new lladdr or NULL, if it is not supplied.
+ -- new is new state.
+ -- flags
+ NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
+ if it is different.
+ NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
+ lladdr instead of overriding it
+ if it is different.
+ It also allows to retain current state
+ if lladdr is unchanged.
+ NEIGH_UPDATE_F_ADMIN means that the change is administrative.
+
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
+ NTF_ROUTER flag.
+ NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
+ a router.
+
+ Caller MUST hold reference count on the entry.
+
+ Returns 0 on success (including the case where a differing lladdr
+ is ignored because no override flag allows replacing it),
+ -EPERM when a non-administrative update targets a NUD_NOARP or
+ NUD_PERMANENT entry, and -EINVAL when the device uses addresses,
+ no lladdr is supplied and the entry holds no valid one.
+
+ neigh->lock is taken for writing for the whole update; it is
+ dropped temporarily while queued packets are sent out.
+ */
+
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+ u32 flags)
+{
+ u8 old;
+ int err;
+ int notify = 0; /* non-zero: call neigh_update_notify() after unlocking */
+ struct net_device *dev;
+ int update_isrouter = 0;
+
+ write_lock_bh(&neigh->lock);
+
+ dev = neigh->dev;
+ old = neigh->nud_state;
+ err = -EPERM;
+
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ (old & (NUD_NOARP | NUD_PERMANENT)))
+ goto out;
+
+ if (!(new & NUD_VALID)) { /* moving to a non-valid state: no address handling needed */
+ neigh_del_timer(neigh);
+ if (old & NUD_CONNECTED)
+ neigh_suspect(neigh);
+ neigh->nud_state = new;
+ err = 0;
+ notify = old & NUD_VALID;
+ goto out;
+ }
+
+ /* Compare new lladdr with cached one */
+ if (!dev->addr_len) {
+ /* First case: device needs no address. */
+ lladdr = neigh->ha;
+ } else if (lladdr) {
+ /* The second case: if something is already cached
+ and a new address is proposed:
+ - compare new & old
+ - if they are different, check override flag
+ */
+ if ((old & NUD_VALID) &&
+ !memcmp(lladdr, neigh->ha, dev->addr_len))
+ lladdr = neigh->ha; /* from here on, lladdr == neigh->ha means "address unchanged" */
+ } else {
+ /* No address is supplied; if we know something,
+ use it, otherwise discard the request.
+ */
+ err = -EINVAL;
+ if (!(old & NUD_VALID))
+ goto out;
+ lladdr = neigh->ha;
+ }
+
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+ neigh->updated = jiffies;
+
+ /* If entry was valid and address is not changed,
+ do not change entry state, if new one is STALE.
+ */
+ err = 0;
+ update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
+ if (old & NUD_VALID) {
+ if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { /* different address, override not allowed */
+ update_isrouter = 0;
+ if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
+ (old & NUD_CONNECTED)) {
+ lladdr = neigh->ha; /* keep the old address but demote the entry */
+ new = NUD_STALE;
+ } else
+ goto out;
+ } else {
+ if (lladdr == neigh->ha && new == NUD_STALE &&
+ ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
+ (old & NUD_CONNECTED))
+ )
+ new = old;
+ }
+ }
+
+ if (new != old) {
+ neigh_del_timer(neigh);
+ if (new & NUD_IN_TIMER)
+ neigh_add_timer(neigh, (jiffies +
+ ((new & NUD_REACHABLE) ?
+ neigh->parms->reachable_time :
+ 0)));
+ neigh->nud_state = new;
+ }
+
+ if (lladdr != neigh->ha) { /* address really changed: store it and refresh cached headers */
+ memcpy(&neigh->ha, lladdr, dev->addr_len);
+ neigh_update_hhs(neigh);
+ if (!(new & NUD_CONNECTED))
+ neigh->confirmed = jiffies -
+ (neigh->parms->base_reachable_time << 1);
+ notify = 1;
+ }
+ if (new == old)
+ goto out;
+ if (new & NUD_CONNECTED)
+ neigh_connect(neigh);
+ else
+ neigh_suspect(neigh);
+ if (!(old & NUD_VALID)) { /* entry just became valid: send out what was queued */
+ struct sk_buff *skb;
+
+ /* Again: avoid dead loop if something went wrong */
+
+ while (neigh->nud_state & NUD_VALID &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct neighbour *n1 = neigh;
+ write_unlock_bh(&neigh->lock); /* output is called without the entry lock */
+ /* On shaper/eql skb->dst->neighbour != neigh :( */
+ if (skb->dst && skb->dst->neighbour)
+ n1 = skb->dst->neighbour;
+ n1->output(skb);
+ write_lock_bh(&neigh->lock);
+ }
+ skb_queue_purge(&neigh->arp_queue);
+ }
+out:
+ if (update_isrouter) {
+ neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
+ (neigh->flags | NTF_ROUTER) :
+ (neigh->flags & ~NTF_ROUTER);
+ }
+ write_unlock_bh(&neigh->lock);
+
+ if (notify)
+ neigh_update_notify(neigh);
+
+ return err;
+}
+EXPORT_SYMBOL(neigh_update);
+
+/*
+ * Look up the entry for (@saddr, @dev) and mark it NUD_STALE with
+ * @lladdr, overriding any cached address.  The lookup is allowed to
+ * create the entry when an address is supplied or the device uses no
+ * link-layer addresses.
+ *
+ * Returns whatever __neigh_lookup() returned (possibly NULL).
+ */
+struct neighbour *neigh_event_ns(struct neigh_table *tbl,
+				 u8 *lladdr, void *saddr,
+				 struct net_device *dev)
+{
+	int creat = lladdr || !dev->addr_len;
+	struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, creat);
+
+	if (!neigh)
+		return NULL;
+
+	neigh_update(neigh, lladdr, NUD_STALE,
+		     NEIGH_UPDATE_F_OVERRIDE);
+	return neigh;
+}
+EXPORT_SYMBOL(neigh_event_ns);
+
+/*
+ * Attach a cached hardware header for @protocol to @dst.
+ *
+ * An existing hh_cache of @n with the same protocol is reused;
+ * otherwise one is allocated (GFP_ATOMIC), filled by the device's
+ * header_ops->cache hook and linked to the head of n->hh.  Its output
+ * hook follows the current connectivity state of @n.  On allocation or
+ * cache-hook failure dst->hh is left untouched.
+ *
+ * The only caller visible in this file holds neigh->lock for writing.
+ */
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
+			  __be16 protocol)
+{
+	struct net_device *dev = dst->dev;
+	struct hh_cache *hh = n->hh;
+
+	while (hh && hh->hh_type != protocol)
+		hh = hh->hh_next;
+
+	if (!hh) {
+		hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
+		if (hh != NULL) {
+			seqlock_init(&hh->hh_lock);
+			hh->hh_type = protocol;
+			atomic_set(&hh->hh_refcnt, 0);
+			hh->hh_next = NULL;
+
+			if (dev->header_ops->cache(n, hh)) {
+				kfree(hh);
+				hh = NULL;
+			} else {
+				/* Reference held by the neighbour's list. */
+				atomic_inc(&hh->hh_refcnt);
+				hh->hh_next = n->hh;
+				n->hh = hh;
+				if (n->nud_state & NUD_CONNECTED)
+					hh->hh_output = n->ops->hh_output;
+				else
+					hh->hh_output = n->ops->output;
+			}
+		}
+	}
+
+	if (!hh)
+		return;
+
+	/* Reference held by the dst entry. */
+	atomic_inc(&hh->hh_refcnt);
+	dst->hh = hh;
+}
+
+/* This function can be used in contexts where only the old
+   dev_queue_xmit worked, e.g. if you want to override the normal output
+   path (eql, shaper) but resolution has not been done yet.
+
+   Returns 0 when building the header failed and the device's rebuild
+   hook returned non-zero; otherwise the result of dev_queue_xmit().
+ */
+
+int neigh_compat_output(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+
+	__skb_pull(skb, skb_network_offset(skb));
+
+	if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
+			    skb->len) < 0) {
+		if (dev->header_ops->rebuild(skb))
+			return 0;
+	}
+
+	return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_compat_output);
+
+/* Slow and careful.
+ *
+ * Output path for entries that may still need resolution.  If
+ * neigh_event_send() takes the packet (returns non-zero), 0 is
+ * returned.  Otherwise the link-layer header is built under
+ * neigh->lock - initialising the dst's cached header first when the
+ * device supports header caching and none is attached yet - and the
+ * packet is passed to ops->queue_xmit().
+ *
+ * Returns the queue_xmit() result, 0 when the packet was taken by
+ * resolution, or -EINVAL (packet freed) when there is no dst/neighbour
+ * or the header could not be built.
+ */
+
+int neigh_resolve_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct neighbour *neigh = dst ? dst->neighbour : NULL;
+	struct net_device *dev;
+	int err;
+
+	if (!neigh) {
+		NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
+			      dst, dst ? dst->neighbour : NULL);
+		goto drop;
+	}
+
+	__skb_pull(skb, skb_network_offset(skb));
+
+	if (neigh_event_send(neigh, skb))
+		return 0;
+
+	dev = neigh->dev;
+	if (dev->header_ops->cache && !dst->hh) {
+		write_lock_bh(&neigh->lock);
+		/* Re-check under the lock before initialising. */
+		if (!dst->hh)
+			neigh_hh_init(neigh, dst, dst->ops->protocol);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+		write_unlock_bh(&neigh->lock);
+	} else {
+		read_lock_bh(&neigh->lock);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+		read_unlock_bh(&neigh->lock);
+	}
+
+	if (err >= 0)
+		return neigh->ops->queue_xmit(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(neigh_resolve_output);
+
+/* As fast as possible without hh cache.
+ *
+ * Builds the link-layer header from neigh->ha under the read lock and
+ * hands the packet to ops->queue_xmit().  Returns its result, or
+ * -EINVAL (packet freed) when the header could not be built.
+ */
+
+int neigh_connected_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct neighbour *neigh = dst->neighbour;
+	struct net_device *dev = neigh->dev;
+	int err;
+
+	__skb_pull(skb, skb_network_offset(skb));
+
+	read_lock_bh(&neigh->lock);
+	err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+			      neigh->ha, NULL, skb->len);
+	read_unlock_bh(&neigh->lock);
+
+	if (err < 0) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return neigh->ops->queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_connected_output);
+
+/*
+ * Proxy timer handler; @arg is the neigh_table pointer.
+ *
+ * Walks the proxy queue under its lock.  Packets whose scheduled time
+ * has come are unlinked and either re-injected through tbl->proxy_redo
+ * (when the hook is set and the device is running) or freed; the device
+ * reference held for the queued packet is dropped either way.  The
+ * timer is then re-armed for the earliest remaining packet, if any.
+ */
+static void neigh_proxy_process(unsigned long arg)
+{
+	struct neigh_table *tbl = (struct neigh_table *)arg;
+	long sched_next = 0;
+	unsigned long now = jiffies;
+	struct sk_buff *skb, *n;
+
+	spin_lock(&tbl->proxy_queue.lock);
+
+	skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+		long tdif = NEIGH_CB(skb)->sched_next - now;
+		struct net_device *dev;
+
+		if (tdif > 0) {
+			/* Not due yet: track the nearest deadline. */
+			if (!sched_next || tdif < sched_next)
+				sched_next = tdif;
+			continue;
+		}
+
+		dev = skb->dev;
+		__skb_unlink(skb, &tbl->proxy_queue);
+		if (tbl->proxy_redo && netif_running(dev))
+			tbl->proxy_redo(skb);
+		else
+			kfree_skb(skb);
+
+		dev_put(dev);
+	}
+	del_timer(&tbl->proxy_timer);
+	if (sched_next)
+		mod_timer(&tbl->proxy_timer, jiffies + sched_next);
+	spin_unlock(&tbl->proxy_queue.lock);
+}
+
+/*
+ * Queue @skb on the proxy queue of @tbl for delayed processing.
+ *
+ * The packet is scheduled at a random point within p->proxy_delay
+ * jiffies from now and the proxy timer is (re)armed for the earlier of
+ * its current expiry and the new packet's time.  If the queue already
+ * holds more than p->proxy_qlen packets the skb is freed instead.
+ *
+ * On success the skb's dst is released and a reference on skb->dev is
+ * taken for as long as the packet sits on the queue.
+ */
+void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
+		    struct sk_buff *skb)
+{
+	unsigned long now = jiffies;
+	unsigned long sched_next = now;
+
+	/*
+	 * A proxy_delay of zero must not reach the modulo (division by
+	 * zero); treat it as "no delay".  NOTE(review): callers presumably
+	 * filter out a zero delay already - confirm; this guard keeps the
+	 * function safe on its own.
+	 */
+	if (p->proxy_delay)
+		sched_next += net_random() % p->proxy_delay;
+
+	if (tbl->proxy_queue.qlen > p->proxy_qlen) {
+		kfree_skb(skb);
+		return;
+	}
+
+	NEIGH_CB(skb)->sched_next = sched_next;
+	NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
+
+	spin_lock(&tbl->proxy_queue.lock);
+	if (del_timer(&tbl->proxy_timer)) {
+		/* Keep the earlier deadline of the pending timer. */
+		if (time_before(tbl->proxy_timer.expires, sched_next))
+			sched_next = tbl->proxy_timer.expires;
+	}
+	dst_release(skb->dst);
+	skb->dst = NULL;
+	dev_hold(skb->dev);
+	__skb_queue_tail(&tbl->proxy_queue, skb);
+	mod_timer(&tbl->proxy_timer, sched_next);
+	spin_unlock(&tbl->proxy_queue.lock);
+}
+EXPORT_SYMBOL(pneigh_enqueue);
+
+/*
+ * Find the parms set of @tbl for interface @ifindex in namespace @net.
+ * An @ifindex of 0 selects the first device-less parms set on the
+ * list.  Returns NULL when nothing matches.
+ */
+static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
+						      struct net *net, int ifindex)
+{
+	struct neigh_parms *p;
+
+	for (p = &tbl->parms; p; p = p->next) {
+		if (p->dev) {
+			if (p->dev->ifindex == ifindex &&
+			    net_eq(neigh_parms_net(p), net))
+				return p;
+		} else if (!ifindex) {
+			return p;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Create a per-device copy of the table's default (ifindex 0) parms.
+ *
+ * The copy gets a refcount of 1 and a freshly randomised reachable_time.
+ * If the device supplies neigh_setup() and it returns non-zero, the copy
+ * is freed and NULL is returned.  On success a reference on @dev is taken
+ * and the copy is linked right behind tbl->parms.  Returns NULL as well
+ * when no default parms are found or the allocation fails.
+ */
+struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
+				      struct neigh_table *tbl)
+{
+	struct net *net = dev_net(dev);
+	struct neigh_parms *tmpl = lookup_neigh_params(tbl, net, 0);
+	struct neigh_parms *p;
+
+	if (!tmpl)
+		return NULL;
+
+	p = kmemdup(tmpl, sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return NULL;
+
+	p->tbl = tbl;
+	atomic_set(&p->refcnt, 1);
+	INIT_RCU_HEAD(&p->rcu_head);
+	p->reachable_time = neigh_rand_reach_time(p->base_reachable_time);
+
+	if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
+		kfree(p);
+		return NULL;
+	}
+
+	dev_hold(dev);
+	p->dev = dev;
+#ifdef CONFIG_NET_NS
+	p->net = hold_net(net);
+#endif
+	p->sysctl_table = NULL;
+
+	write_lock_bh(&tbl->lock);
+	p->next = tbl->parms.next;
+	tbl->parms.next = p;
+	write_unlock_bh(&tbl->lock);
+
+	return p;
+}
+EXPORT_SYMBOL(neigh_parms_alloc);
+
+/*
+ * RCU callback queued by neigh_parms_release(): drops one reference on the
+ * neigh_parms that embeds @head.
+ */
+static void neigh_rcu_free_parms(struct rcu_head *head)
+{
+	neigh_parms_put(container_of(head, struct neigh_parms, rcu_head));
+}
+
+/*
+ * Unlink @parms from @tbl and schedule its release.
+ *
+ * NULL and the table's embedded default parms are ignored.  When @parms is
+ * found it is unlinked and marked dead under tbl->lock; afterwards its
+ * device reference (if any) is dropped and neigh_rcu_free_parms() is
+ * queued via call_rcu().  A missing entry is only reported through
+ * NEIGH_PRINTK1.
+ */
+void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
+{
+	struct neigh_parms **link;
+
+	if (!parms || parms == &tbl->parms)
+		return;
+
+	write_lock_bh(&tbl->lock);
+	for (link = &tbl->parms.next; *link; link = &(*link)->next)
+		if (*link == parms)
+			break;
+
+	if (!*link) {
+		write_unlock_bh(&tbl->lock);
+		NEIGH_PRINTK1("neigh_parms_release: not found\n");
+		return;
+	}
+
+	*link = parms->next;
+	parms->dead = 1;
+	write_unlock_bh(&tbl->lock);
+
+	if (parms->dev)
+		dev_put(parms->dev);
+	call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
+}
+EXPORT_SYMBOL(neigh_parms_release);
+
+/* Release the namespace reference held by @parms, then free the structure. */
+static void neigh_parms_destroy(struct neigh_parms *parms)
+{
+	struct net *net = neigh_parms_net(parms);
+
+	release_net(net);
+	kfree(parms);
+}
+
+static struct lock_class_key neigh_table_proxy_queue_class;
+
+/*
+ * Initialise the runtime state of @tbl: default parms, slab cache, per-cpu
+ * statistics, proc entry, both hash tables, lock and timers.
+ *
+ * The table is not added to the global neigh_tables list here;
+ * neigh_table_init() does that after calling this function.
+ * Allocation failures for the statistics, proc entry or hash tables are
+ * fatal (panic).
+ */
+void neigh_table_init_no_netlink(struct neigh_table *tbl)
+{
+ unsigned long now = jiffies;
+ unsigned long phsize;
+
+#ifdef CONFIG_NET_NS
+ tbl->parms.net = &init_net;
+#endif
+ atomic_set(&tbl->parms.refcnt, 1);
+ INIT_RCU_HEAD(&tbl->parms.rcu_head);
+ tbl->parms.reachable_time =
+ neigh_rand_reach_time(tbl->parms.base_reachable_time);
+
+ /* Only create the cache if the caller did not already provide one. */
+ if (!tbl->kmem_cachep)
+ tbl->kmem_cachep =
+ kmem_cache_create(tbl->id, tbl->entry_size, 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+ tbl->stats = alloc_percpu(struct neigh_statistics);
+ if (!tbl->stats)
+ panic("cannot create neighbour cache statistics");
+
+#ifdef CONFIG_PROC_FS
+ tbl->pde = proc_create_data(tbl->id, 0, init_net.proc_net_stat,
+ &neigh_stat_seq_fops, tbl);
+ if (!tbl->pde)
+ panic("cannot create neighbour proc dir entry");
+#endif
+
+ /* Start with hash_mask + 1 == 2 neighbour hash buckets. */
+ tbl->hash_mask = 1;
+ tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
+
+ phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
+ tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
+
+ if (!tbl->hash_buckets || !tbl->phash_buckets)
+ panic("cannot allocate neighbour cache hashes");
+
+ get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+
+ rwlock_init(&tbl->lock);
+ /* The GC timer is armed immediately (first run at now + 1 jiffy). */
+ setup_timer(&tbl->gc_timer, neigh_periodic_timer, (unsigned long)tbl);
+ tbl->gc_timer.expires = now + 1;
+ add_timer(&tbl->gc_timer);
+
+ /* The proxy timer is only set up here; pneigh_enqueue() arms it. */
+ setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
+ skb_queue_head_init_class(&tbl->proxy_queue,
+ &neigh_table_proxy_queue_class);
+
+ tbl->last_flush = now;
+ tbl->last_rand = now + tbl->parms.reachable_time * 20;
+}
+EXPORT_SYMBOL(neigh_table_init_no_netlink);
+
+/*
+ * Initialise @tbl and put it at the head of the global neigh_tables list.
+ *
+ * If a table with the same family was already registered, the new table is
+ * still linked in, but an error is logged together with a stack dump.
+ */
+void neigh_table_init(struct neigh_table *tbl)
+{
+	struct neigh_table *dup;
+
+	neigh_table_init_no_netlink(tbl);
+
+	write_lock(&neigh_tbl_lock);
+	dup = neigh_tables;
+	while (dup && dup->family != tbl->family)
+		dup = dup->next;
+	tbl->next = neigh_tables;
+	neigh_tables = tbl;
+	write_unlock(&neigh_tbl_lock);
+
+	if (unlikely(dup)) {
+		printk(KERN_ERR "NEIGH: Registering multiple tables for "
+		       "family %d\n", tbl->family);
+		dump_stack();
+	}
+}
+EXPORT_SYMBOL(neigh_table_init);
+
+/*
+ * Tear down @tbl: stop both timers, purge the proxy queue, flush entries
+ * via neigh_ifdown(tbl, NULL), unlink the table from neigh_tables and free
+ * the hash tables, proc entry, per-cpu statistics and slab cache.
+ *
+ * Entries still accounted in tbl->entries after the flush are only
+ * reported ("neighbour leakage"); teardown continues regardless.
+ * Always returns 0.
+ */
+int neigh_table_clear(struct neigh_table *tbl)
+{
+ struct neigh_table **tp;
+
+ /* It is not clean... Fix it to unload IPv6 module safely */
+ del_timer_sync(&tbl->gc_timer);
+ del_timer_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue);
+ neigh_ifdown(tbl, NULL);
+ if (atomic_read(&tbl->entries))
+ printk(KERN_CRIT "neighbour leakage\n");
+ write_lock(&neigh_tbl_lock);
+ for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
+ if (*tp == tbl) {
+ *tp = tbl->next;
+ break;
+ }
+ }
+ write_unlock(&neigh_tbl_lock);
+
+ /* Pointers are reset to NULL after each free. */
+ neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
+ tbl->hash_buckets = NULL;
+
+ kfree(tbl->phash_buckets);
+ tbl->phash_buckets = NULL;
+
+ remove_proc_entry(tbl->id, init_net.proc_net_stat);
+
+ free_percpu(tbl->stats);
+ tbl->stats = NULL;
+
+ kmem_cache_destroy(tbl->kmem_cachep);
+ tbl->kmem_cachep = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(neigh_table_clear);
+
+/*
+ * Netlink handler that deletes a neighbour or proxy entry.
+ *
+ * Requires an NDA_DST attribute at least tbl->key_len bytes long.  With
+ * NTF_PROXY set the proxy entry is removed through pneigh_delete();
+ * otherwise a device (ndm_ifindex) is mandatory and the matching neighbour
+ * is forced to NUD_FAILED via neigh_update().
+ *
+ * Returns 0 or a negative errno: -EINVAL for malformed requests, -ENODEV
+ * for an unknown ifindex, -ENOENT when no neighbour matches and
+ * -EAFNOSUPPORT when no table handles ndm_family.
+ */
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *dst_attr;
+ struct neigh_table *tbl;
+ struct net_device *dev = NULL;
+ int err = -EINVAL;
+
+ if (nlmsg_len(nlh) < sizeof(*ndm))
+ goto out;
+
+ dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
+ if (dst_attr == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+ }
+
+ read_lock(&neigh_tbl_lock);
+ for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ struct neighbour *neigh;
+
+ if (tbl->family != ndm->ndm_family)
+ continue;
+ /*
+ * Matching table found: the list lock is dropped here and every
+ * path below leaves the loop through a goto.
+ */
+ read_unlock(&neigh_tbl_lock);
+
+ if (nla_len(dst_attr) < tbl->key_len)
+ goto out_dev_put;
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out_dev_put;
+ }
+
+ /* Non-proxy entries need a device; err is still -EINVAL here. */
+ if (dev == NULL)
+ goto out_dev_put;
+
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
+ goto out_dev_put;
+ }
+
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ goto out_dev_put;
+ }
+ /* Only reached when no table matched the requested family. */
+ read_unlock(&neigh_tbl_lock);
+ err = -EAFNOSUPPORT;
+
+out_dev_put:
+ if (dev)
+ dev_put(dev);
+out:
+ return err;
+}
+
+/*
+ * Netlink handler that creates or updates a neighbour or proxy entry.
+ *
+ * NDA_DST is mandatory; NDA_LLADDR, when present together with a device,
+ * must be at least dev->addr_len bytes.  NTF_PROXY requests go through
+ * pneigh_lookup(); everything else needs a device and honours the
+ * NLM_F_CREATE / NLM_F_EXCL / NLM_F_REPLACE message flags.
+ *
+ * Returns 0 or a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOENT,
+ * -EEXIST, -EAFNOSUPPORT, or whatever nlmsg_parse(), __neigh_lookup_errno()
+ * or neigh_update() report).
+ */
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct neigh_table *tbl;
+ struct net_device *dev = NULL;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ goto out;
+
+ /* Default result for the validation failures below. */
+ err = -EINVAL;
+ if (tb[NDA_DST] == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+ goto out_dev_put;
+ }
+
+ read_lock(&neigh_tbl_lock);
+ for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
+
+ if (tbl->family != ndm->ndm_family)
+ continue;
+ /*
+ * Matching table found: the list lock is dropped here and every
+ * path below leaves the loop through a goto.
+ */
+ read_unlock(&neigh_tbl_lock);
+
+ if (nla_len(tb[NDA_DST]) < tbl->key_len)
+ goto out_dev_put;
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
+ err = -ENOBUFS;
+ /* NOTE(review): final argument 1 presumably asks for creation - confirm. */
+ pn = pneigh_lookup(tbl, net, dst, dev, 1);
+ if (pn) {
+ pn->flags = ndm->ndm_flags;
+ err = 0;
+ }
+ goto out_dev_put;
+ }
+
+ /* Non-proxy entries need a device; err is still -EINVAL here. */
+ if (dev == NULL)
+ goto out_dev_put;
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
+ goto out_dev_put;
+ }
+
+ neigh = __neigh_lookup_errno(tbl, dst, dev);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out_dev_put;
+ }
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
+ goto out_dev_put;
+ }
+
+ /* Without NLM_F_REPLACE the update must not override. */
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ }
+
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ neigh_release(neigh);
+ goto out_dev_put;
+ }
+
+ /* Only reached when no table matched the requested family. */
+ read_unlock(&neigh_tbl_lock);
+ err = -EAFNOSUPPORT;
+
+out_dev_put:
+ if (dev)
+ dev_put(dev);
+out:
+ return err;
+}
+
+/*
+ * Append a nested NDTA_PARMS attribute describing @parms to @skb.
+ *
+ * NDTPA_IFINDEX is only emitted for device-bound parms.  Returns the value
+ * of nla_nest_end() on success, -ENOBUFS if the nest cannot be started and
+ * -EMSGSIZE (after cancelling the nest) via the nla_put_failure label.
+ * NOTE(review): the NLA_PUT_* macros presumably jump to nla_put_failure
+ * when the skb is full - their definitions are not in this file.
+ */
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NDTA_PARMS);
+ if (nest == NULL)
+ return -ENOBUFS;
+
+ if (parms->dev)
+ NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
+
+ NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
+ NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+ NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
+ NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
+ NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
+ NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
+ NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
+ NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
+ parms->base_reachable_time);
+ NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
+ NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
+ NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
+ NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
+ NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
+ NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+/*
+ * Build one netlink message describing @tbl as a whole: name, GC settings,
+ * an NDTA_CONFIG snapshot, NDTA_STATS summed over all possible CPUs and
+ * the table's default parms.
+ *
+ * tbl->lock is held for reading while the attributes are filled in.
+ * Returns the value of nlmsg_end() on success or -EMSGSIZE when the
+ * message does not fit (the partial message is cancelled).
+ */
+static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
+ u32 pid, u32 seq, int type, int flags)
+{
+ struct nlmsghdr *nlh;
+ struct ndtmsg *ndtmsg;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndtmsg = nlmsg_data(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+
+ NLA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+ NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
+ NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
+ NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
+ NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
+
+ {
+ unsigned long now = jiffies;
+ /* last_flush/last_rand are reported as ages relative to now, in msecs. */
+ unsigned int flush_delta = now - tbl->last_flush;
+ unsigned int rand_delta = now - tbl->last_rand;
+
+ struct ndt_config ndc = {
+ .ndtc_key_len = tbl->key_len,
+ .ndtc_entry_size = tbl->entry_size,
+ .ndtc_entries = atomic_read(&tbl->entries),
+ .ndtc_last_flush = jiffies_to_msecs(flush_delta),
+ .ndtc_last_rand = jiffies_to_msecs(rand_delta),
+ .ndtc_hash_rnd = tbl->hash_rnd,
+ .ndtc_hash_mask = tbl->hash_mask,
+ .ndtc_hash_chain_gc = tbl->hash_chain_gc,
+ .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ };
+
+ NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+ }
+
+ {
+ int cpu;
+ struct ndt_stats ndst;
+
+ memset(&ndst, 0, sizeof(ndst));
+
+ /* Accumulate the per-cpu counters into a single total. */
+ for_each_possible_cpu(cpu) {
+ struct neigh_statistics *st;
+
+ st = per_cpu_ptr(tbl->stats, cpu);
+ ndst.ndts_allocs += st->allocs;
+ ndst.ndts_destroys += st->destroys;
+ ndst.ndts_hash_grows += st->hash_grows;
+ ndst.ndts_res_failed += st->res_failed;
+ ndst.ndts_lookups += st->lookups;
+ ndst.ndts_hits += st->hits;
+ ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
+ ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
+ ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
+ ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ }
+
+ NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+ }
+
+ /* The table's embedded default parms must not be bound to a device. */
+ BUG_ON(tbl->parms.dev);
+ if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+ goto nla_put_failure;
+
+ read_unlock_bh(&tbl->lock);
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ read_unlock_bh(&tbl->lock);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/*
+ * Build one netlink message carrying only the table name and a single
+ * parms set (@parms) of @tbl.
+ *
+ * tbl->lock is held for reading while the attributes are written.
+ * Returns the value of nlmsg_end() on success, or -EMSGSIZE when the
+ * header or an attribute does not fit (the message is then cancelled).
+ */
+static int neightbl_fill_param_info(struct sk_buff *skb,
+				    struct neigh_table *tbl,
+				    struct neigh_parms *parms,
+				    u32 pid, u32 seq, int type,
+				    unsigned int flags)
+{
+	struct nlmsghdr *nlh;
+	struct ndtmsg *hdr;
+	int failed;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*hdr), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	hdr = nlmsg_data(nlh);
+
+	read_lock_bh(&tbl->lock);
+	hdr->ndtm_family = tbl->family;
+	hdr->ndtm_pad1 = 0;
+	hdr->ndtm_pad2 = 0;
+
+	failed = nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
+		 neightbl_fill_parms(skb, parms) < 0;
+	read_unlock_bh(&tbl->lock);
+
+	if (failed) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	return nlmsg_end(skb, nlh);
+}
+
+/*
+ * Attribute policy for the top-level NDTA_* attributes; passed to
+ * nlmsg_parse() by neightbl_set().
+ */
+static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
+ [NDTA_NAME] = { .type = NLA_STRING },
+ [NDTA_THRESH1] = { .type = NLA_U32 },
+ [NDTA_THRESH2] = { .type = NLA_U32 },
+ [NDTA_THRESH3] = { .type = NLA_U32 },
+ [NDTA_GC_INTERVAL] = { .type = NLA_U64 },
+ [NDTA_PARMS] = { .type = NLA_NESTED },
+};
+
+/*
+ * Attribute policy for the NDTPA_* attributes nested inside NDTA_PARMS;
+ * passed to nla_parse_nested() by neightbl_set().
+ */
+static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
+ [NDTPA_IFINDEX] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
+ [NDTPA_APP_PROBES] = { .type = NLA_U32 },
+ [NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
+ [NDTPA_GC_STALETIME] = { .type = NLA_U64 },
+ [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
+ [NDTPA_RETRANS_TIME] = { .type = NLA_U64 },
+ [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
+ [NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
+ [NDTPA_LOCKTIME] = { .type = NLA_U64 },
+};
+
+/*
+ * Netlink handler that changes neighbour table tunables.
+ *
+ * The table is selected by NDTA_NAME (mandatory) and, if non-zero,
+ * ndtm_family.  An optional nested NDTA_PARMS updates the parms set chosen
+ * by NDTPA_IFINDEX (0 selects the table defaults); NDTA_THRESH1..3 and
+ * NDTA_GC_INTERVAL update the table itself.
+ *
+ * neigh_tbl_lock is held for reading across the whole update and tbl->lock
+ * for writing while values are stored.  Returns 0, -EINVAL (no name),
+ * -ENOENT (unknown table or parms) or a parse error.
+ */
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct net *net = sock_net(skb->sk);
+ struct neigh_table *tbl;
+ struct ndtmsg *ndtmsg;
+ struct nlattr *tb[NDTA_MAX+1];
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+ nl_neightbl_policy);
+ if (err < 0)
+ goto errout;
+
+ if (tb[NDTA_NAME] == NULL) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ ndtmsg = nlmsg_data(nlh);
+ read_lock(&neigh_tbl_lock);
+ for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+ continue;
+
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
+ break;
+ }
+
+ if (tbl == NULL) {
+ err = -ENOENT;
+ goto errout_locked;
+ }
+
+ /*
+ * We acquire tbl->lock to be nice to the periodic timers and
+ * make sure they always see a consistent set of values.
+ */
+ write_lock_bh(&tbl->lock);
+
+ if (tb[NDTA_PARMS]) {
+ struct nlattr *tbp[NDTPA_MAX+1];
+ struct neigh_parms *p;
+ int i, ifindex = 0;
+
+ err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
+ nl_ntbl_parm_policy);
+ if (err < 0)
+ goto errout_tbl_lock;
+
+ if (tbp[NDTPA_IFINDEX])
+ ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
+
+ p = lookup_neigh_params(tbl, net, ifindex);
+ if (p == NULL) {
+ err = -ENOENT;
+ goto errout_tbl_lock;
+ }
+
+ /*
+ * Apply every attribute that is present.  Attributes without a
+ * case below (e.g. NDTPA_IFINDEX) are silently ignored, and the
+ * values are stored without range checks.
+ */
+ for (i = 1; i <= NDTPA_MAX; i++) {
+ if (tbp[i] == NULL)
+ continue;
+
+ switch (i) {
+ case NDTPA_QUEUE_LEN:
+ p->queue_len = nla_get_u32(tbp[i]);
+ break;
+ case NDTPA_PROXY_QLEN:
+ p->proxy_qlen = nla_get_u32(tbp[i]);
+ break;
+ case NDTPA_APP_PROBES:
+ p->app_probes = nla_get_u32(tbp[i]);
+ break;
+ case NDTPA_UCAST_PROBES:
+ p->ucast_probes = nla_get_u32(tbp[i]);
+ break;
+ case NDTPA_MCAST_PROBES:
+ p->mcast_probes = nla_get_u32(tbp[i]);
+ break;
+ case NDTPA_BASE_REACHABLE_TIME:
+ p->base_reachable_time = nla_get_msecs(tbp[i]);
+ break;
+ case NDTPA_GC_STALETIME:
+ p->gc_staletime = nla_get_msecs(tbp[i]);
+ break;
+ case NDTPA_DELAY_PROBE_TIME:
+ p->delay_probe_time = nla_get_msecs(tbp[i]);
+ break;
+ case NDTPA_RETRANS_TIME:
+ p->retrans_time = nla_get_msecs(tbp[i]);
+ break;
+ case NDTPA_ANYCAST_DELAY:
+ p->anycast_delay = nla_get_msecs(tbp[i]);
+ break;
+ case NDTPA_PROXY_DELAY:
+ p->proxy_delay = nla_get_msecs(tbp[i]);
+ break;
+ case NDTPA_LOCKTIME:
+ p->locktime = nla_get_msecs(tbp[i]);
+ break;
+ }
+ }
+ }
+
+ if (tb[NDTA_THRESH1])
+ tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+
+ if (tb[NDTA_THRESH2])
+ tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+
+ if (tb[NDTA_THRESH3])
+ tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+
+ if (tb[NDTA_GC_INTERVAL])
+ tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+
+ err = 0;
+
+ /* Unwind in reverse order of acquisition. */
+errout_tbl_lock:
+ write_unlock_bh(&tbl->lock);
+errout_locked:
+ read_unlock(&neigh_tbl_lock);
+errout:
+ return err;
+}
+
+/*
+ * Netlink dump callback for neighbour tables.
+ *
+ * For every table matching the requested family, one message describing
+ * the table is emitted, followed by one message per device parms set that
+ * belongs to the caller's namespace.
+ *
+ * Resume state lives in cb->args: args[0] is the index of the table to
+ * continue with, args[1] the number of parms entries of that table that
+ * were already emitted.  Returns skb->len.
+ */
+static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int family, tidx, nidx = 0;
+	int tbl_skip = cb->args[0];
+	int neigh_skip = cb->args[1];
+	struct neigh_table *tbl;
+
+	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+		struct neigh_parms *p;
+
+		if (tidx < tbl_skip || (family && tbl->family != family))
+			continue;
+
+		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
+				       cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+				       NLM_F_MULTI) <= 0)
+			break;
+
+		for (nidx = 0, p = tbl->parms.next; p; p = p->next) {
+			if (!net_eq(neigh_parms_net(p), net))
+				continue;
+
+			if (nidx < neigh_skip)
+				goto next;
+
+			/*
+			 * Count an entry only after it has been written:
+			 * if it does not fit, nidx must still point at it
+			 * so the next dump call retries it instead of
+			 * silently skipping it.
+			 */
+			if (neightbl_fill_param_info(skb, tbl, p,
+						     NETLINK_CB(cb->skb).pid,
+						     cb->nlh->nlmsg_seq,
+						     RTM_NEWNEIGHTBL,
+						     NLM_F_MULTI) <= 0)
+				goto out;
+		next:
+			nidx++;
+		}
+
+		/* The skip count only applies to the table we resumed in. */
+		neigh_skip = 0;
+	}
+out:
+	read_unlock(&neigh_tbl_lock);
+	cb->args[0] = tidx;
+	cb->args[1] = nidx;
+
+	return skb->len;
+}
+
+/*
+ * Build one netlink message describing @neigh: ndmsg header, NDA_DST,
+ * NDA_LLADDR (only while the state is NUD_VALID), NDA_PROBES and
+ * NDA_CACHEINFO.
+ *
+ * State, link-layer address and timestamps are sampled under neigh->lock.
+ * Returns the value of nlmsg_end() on success or -EMSGSIZE when the
+ * message does not fit (the partial message is cancelled).
+ */
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
+{
+ unsigned long now = jiffies;
+ struct nda_cacheinfo ci;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = neigh->ops->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = neigh->flags;
+ ndm->ndm_type = neigh->type;
+ ndm->ndm_ifindex = neigh->dev->ifindex;
+
+ NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
+
+ read_lock_bh(&neigh->lock);
+ ndm->ndm_state = neigh->nud_state;
+ /* nla_put() is used directly so the lock can be dropped before bailing out. */
+ if ((neigh->nud_state & NUD_VALID) &&
+ nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
+ read_unlock_bh(&neigh->lock);
+ goto nla_put_failure;
+ }
+
+ /* Timestamps are reported as ages relative to now, in clock_t units. */
+ ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
+ ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+ ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
+ ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
+ read_unlock_bh(&neigh->lock);
+
+ NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
+ NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/*
+ * Announce a change of @neigh: first to the in-kernel netevent notifier
+ * chain (NETEVENT_NEIGH_UPDATE), then via __neigh_notify() as an
+ * RTM_NEWNEIGH message.
+ */
+static void neigh_update_notify(struct neighbour *neigh)
+{
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0);
+}
+
+/*
+ * Dump the neighbours of @tbl that belong to the caller's namespace into
+ * @skb, one RTM_NEWNEIGH message each.
+ *
+ * Resume state lives in cb->args: args[1] is the hash bucket to continue
+ * with, args[2] the number of namespace-visible entries of that bucket
+ * that were already emitted.  Returns skb->len when the whole table was
+ * dumped, or -1 when the skb filled up.
+ */
+static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+			    struct netlink_callback *cb)
+{
+	struct net * net = sock_net(skb->sk);
+	struct neighbour *n;
+	int rc, h, s_h = cb->args[1];
+	int idx, s_idx = idx = cb->args[2];
+
+	read_lock_bh(&tbl->lock);
+	for (h = 0; h <= tbl->hash_mask; h++) {
+		if (h < s_h)
+			continue;
+		/* The entry offset only applies to the bucket we resumed in. */
+		if (h > s_h)
+			s_idx = 0;
+		for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) {
+			if (dev_net(n->dev) != net)
+				continue;
+			if (idx < s_idx)
+				goto next;
+			/*
+			 * Count an entry only after it has been written:
+			 * if it does not fit, idx must still point at it
+			 * so the next dump call retries it instead of
+			 * silently skipping it.
+			 */
+			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) <= 0) {
+				read_unlock_bh(&tbl->lock);
+				rc = -1;
+				goto out;
+			}
+		next:
+			idx++;
+		}
+	}
+	read_unlock_bh(&tbl->lock);
+	rc = skb->len;
+out:
+	cb->args[1] = h;
+	cb->args[2] = idx;
+	return rc;
+}
+
+/*
+ * Netlink dump callback for neighbour entries.
+ *
+ * Iterates over all tables matching the requested family and dumps each
+ * with neigh_dump_table().  cb->args[0] holds the index of the table to
+ * resume at; the remaining args are cleared whenever the dump moves on to
+ * a table after the resume point.  Returns skb->len.
+ */
+static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct neigh_table *tbl;
+	int family, first, t = 0;
+
+	read_lock(&neigh_tbl_lock);
+	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+	first = cb->args[0];
+
+	for (tbl = neigh_tables; tbl; tbl = tbl->next, t++) {
+		if (t < first)
+			continue;
+		if (family && tbl->family != family)
+			continue;
+
+		/* Per-table resume state is stale once we are past 'first'. */
+		if (t > first)
+			memset(&cb->args[1], 0, sizeof(cb->args) -
+						sizeof(cb->args[0]));
+
+		if (neigh_dump_table(tbl, skb, cb) < 0)
+			break;
+	}
+	read_unlock(&neigh_tbl_lock);
+
+	cb->args[0] = t;
+	return skb->len;
+}
+
+/*
+ * Invoke @cb(entry, @cookie) for every neighbour in every hash bucket of
+ * @tbl.  tbl->lock is held for reading (BH disabled) during the walk.
+ */
+void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
+{
+	int i;
+
+	read_lock_bh(&tbl->lock);
+	for (i = 0; i <= tbl->hash_mask; i++) {
+		struct neighbour *entry = tbl->hash_buckets[i];
+
+		while (entry) {
+			cb(entry, cookie);
+			entry = entry->next;
+		}
+	}
+	read_unlock_bh(&tbl->lock);
+}
+EXPORT_SYMBOL(neigh_for_each);
+
+/*
+ * Call @cb on every neighbour of @tbl with the entry's own lock held for
+ * writing.  When @cb returns non-zero the entry is unlinked from its hash
+ * chain, marked dead and handed to neigh_cleanup_and_release() once its
+ * lock has been dropped.
+ *
+ * The tbl->lock must be held as a writer and BH disabled.
+ */
+void __neigh_for_each_release(struct neigh_table *tbl,
+ int (*cb)(struct neighbour *))
+{
+ int chain;
+
+ for (chain = 0; chain <= tbl->hash_mask; chain++) {
+ struct neighbour *n, **np;
+
+ /* np always points at the link that references the current entry. */
+ np = &tbl->hash_buckets[chain];
+ while ((n = *np) != NULL) {
+ int release;
+
+ write_lock(&n->lock);
+ release = cb(n);
+ if (release) {
+ *np = n->next;
+ n->dead = 1;
+ } else
+ np = &n->next;
+ write_unlock(&n->lock);
+ if (release)
+ neigh_cleanup_and_release(n);
+ }
+ }
+}
+EXPORT_SYMBOL(__neigh_for_each_release);
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Return the first neighbour of state->tbl that is visible to this
+ * seq_file: it must belong to the file's namespace, be accepted by the
+ * optional neigh_sub_iter hook and, with NEIGH_SEQ_SKIP_NOARP set, have a
+ * state bit other than NUD_NOARP.
+ *
+ * Clears NEIGH_SEQ_IS_PNEIGH and stores the bucket where the search
+ * stopped in state->bucket.  Returns NULL if no entry qualifies.
+ */
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+ struct neighbour *n = NULL;
+ /* NOTE(review): this initial value is overwritten by the loop below. */
+ int bucket = state->bucket;
+
+ state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
+ for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
+ n = tbl->hash_buckets[bucket];
+
+ while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, &fakep);
+ if (!v)
+ goto next;
+ }
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ break;
+ if (n->nud_state & ~NUD_NOARP)
+ break;
+ next:
+ n = n->next;
+ }
+
+ if (n)
+ break;
+ }
+ state->bucket = bucket;
+
+ return n;
+}
+
+/*
+ * Advance from @n to the next neighbour visible to this seq_file, applying
+ * the same filters as neigh_get_first() and moving on to later buckets
+ * (tracked in state->bucket) as chains are exhausted.
+ *
+ * When a neigh_sub_iter hook is installed it is first given the chance to
+ * yield another item for the current @n, in which case @n is returned
+ * again.  If @pos is non-NULL it is decremented when an entry is found by
+ * the chain walk.  Returns NULL at the end of the table.
+ */
+static struct neighbour *neigh_get_next(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+
+ if (state->neigh_sub_iter) {
+ void *v = state->neigh_sub_iter(state, n, pos);
+ if (v)
+ return n;
+ }
+ n = n->next;
+
+ while (1) {
+ while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
+ if (state->neigh_sub_iter) {
+ /* With a sub-iterator, entries are returned without touching *pos here. */
+ void *v = state->neigh_sub_iter(state, n, pos);
+ if (v)
+ return n;
+ goto next;
+ }
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ break;
+
+ if (n->nud_state & ~NUD_NOARP)
+ break;
+ next:
+ n = n->next;
+ }
+
+ if (n)
+ break;
+
+ if (++state->bucket > tbl->hash_mask)
+ break;
+
+ n = tbl->hash_buckets[state->bucket];
+ }
+
+ if (n && pos)
+ --(*pos);
+ return n;
+}
+
+/*
+ * Position on the *pos-th visible neighbour (1-based): start at the first
+ * one and step forward until *pos has been counted down to zero.
+ *
+ * Returns the entry, or NULL when the table holds fewer entries; in that
+ * case *pos is left with the remaining count.
+ */
+static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct neighbour *cur = neigh_get_first(seq);
+
+	if (cur) {
+		--(*pos);
+		while (cur && *pos)
+			cur = neigh_get_next(seq, cur, pos);
+	}
+
+	if (*pos)
+		return NULL;
+	return cur;
+}
+
+/*
+ * Return the first proxy entry of state->tbl that belongs to the
+ * seq_file's namespace, scanning the proxy hash buckets in order.
+ *
+ * Sets NEIGH_SEQ_IS_PNEIGH and stores the bucket where the search stopped
+ * in state->bucket.  Returns NULL if there is no such entry.
+ */
+static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+ struct pneigh_entry *pn = NULL;
+ /* NOTE(review): this initial value is overwritten by the loop below. */
+ int bucket = state->bucket;
+
+ state->flags |= NEIGH_SEQ_IS_PNEIGH;
+ for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
+ pn = tbl->phash_buckets[bucket];
+ /* Skip entries owned by other namespaces. */
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
+ if (pn)
+ break;
+ }
+ state->bucket = bucket;
+
+ return pn;
+}
+
+/*
+ * Advance from @pn to the next proxy entry that belongs to the seq_file's
+ * namespace, moving on to later buckets (tracked in state->bucket) when
+ * the current chain is exhausted.
+ *
+ * If @pos is non-NULL it is decremented when an entry is found.
+ * Returns NULL at the end of the proxy hash.
+ */
+static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
+					    struct pneigh_entry *pn,
+					    loff_t *pos)
+{
+	struct neigh_seq_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct neigh_table *tbl = state->tbl;
+
+	/*
+	 * Apply the namespace filter to the rest of the current chain too;
+	 * otherwise entries owned by other namespaces would be returned
+	 * whenever they follow a matching entry in the same bucket.
+	 */
+	do {
+		pn = pn->next;
+	} while (pn && !net_eq(pneigh_net(pn), net));
+
+	while (!pn) {
+		if (++state->bucket > PNEIGH_HASHMASK)
+			break;
+		pn = tbl->phash_buckets[state->bucket];
+		while (pn && !net_eq(pneigh_net(pn), net))
+			pn = pn->next;
+		if (pn)
+			break;
+	}
+
+	if (pn && pos)
+		--(*pos);
+
+	return pn;
+}
+
+/*
+ * Position on the *pos-th visible proxy entry (1-based): start at the
+ * first one and step forward until *pos has been counted down to zero.
+ *
+ * Returns the entry, or NULL when there are fewer entries; in that case
+ * *pos is left with the remaining count.
+ */
+static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct pneigh_entry *cur = pneigh_get_first(seq);
+
+	if (cur) {
+		--(*pos);
+		while (cur && *pos)
+			cur = pneigh_get_next(seq, cur, pos);
+	}
+
+	if (*pos)
+		return NULL;
+	return cur;
+}
+
+/*
+ * Resolve position *pos to an entry: neighbours are counted first and,
+ * unless NEIGH_SEQ_NEIGH_ONLY is set, proxy entries continue the count.
+ * Works on a local copy, so the caller's *pos is not modified.
+ */
+static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
+{
+	struct neigh_seq_state *state = seq->private;
+	loff_t idxpos = *pos;
+	void *entry = neigh_get_idx(seq, &idxpos);
+
+	if (entry)
+		return entry;
+	if (state->flags & NEIGH_SEQ_NEIGH_ONLY)
+		return NULL;
+
+	return pneigh_get_idx(seq, &idxpos);
+}
+
+/*
+ * seq_file ->start helper for neighbour tables.
+ *
+ * Records @tbl and @neigh_seq_flags (with NEIGH_SEQ_IS_PNEIGH masked out)
+ * in the iterator state and takes tbl->lock for reading; the lock is
+ * released by neigh_seq_stop().  Returns SEQ_START_TOKEN for position 0,
+ * otherwise the entry at *pos (or NULL past the end).
+ */
+void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+ __acquires(tbl->lock)
+{
+ struct neigh_seq_state *state = seq->private;
+
+ state->tbl = tbl;
+ state->bucket = 0;
+ state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
+
+ read_lock_bh(&tbl->lock);
+
+ return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL(neigh_seq_start);
+
+/*
+ * seq_file ->next helper: return the entry following @v and increment
+ * *pos.
+ *
+ * After SEQ_START_TOKEN comes the first neighbour.  Once the neighbours
+ * are exhausted, iteration continues with the proxy entries unless
+ * NEIGH_SEQ_NEIGH_ONLY is set.  Returns NULL at the end.
+ */
+void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct neigh_seq_state *state;
+ void *rc;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = neigh_get_first(seq);
+ goto out;
+ }
+
+ state = seq->private;
+ if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) {
+ rc = neigh_get_next(seq, v, NULL);
+ if (rc)
+ goto out;
+ /* Neighbours exhausted: switch over to the proxy entries. */
+ if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY))
+ rc = pneigh_get_first(seq);
+ } else {
+ BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
+ rc = pneigh_get_next(seq, v, NULL);
+ }
+out:
+ ++(*pos);
+ return rc;
+}
+EXPORT_SYMBOL(neigh_seq_next);
+
+/*
+ * seq_file ->stop helper: drop the read lock on the table that
+ * neigh_seq_start() stored in the iterator state.
+ */
+void neigh_seq_stop(struct seq_file *seq, void *v)
+ __releases(tbl->lock)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct neigh_table *tbl = state->tbl;
+
+ read_unlock_bh(&tbl->lock);
+}
+EXPORT_SYMBOL(neigh_seq_stop);
+
+/* statistics via seq_file */
+
+/*
+ * ->start for the per-table statistics file.
+ *
+ * Position 0 yields SEQ_START_TOKEN (the header line).  Position N > 0
+ * maps to the first possible CPU with index >= N - 1; *pos is updated to
+ * that CPU's index + 1 and its per-cpu statistics are returned.  Returns
+ * NULL when no such CPU exists.
+ */
+static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct proc_dir_entry *pde = seq->private;
+	struct neigh_table *tbl = pde->data;
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	cpu = *pos - 1;
+	while (cpu < NR_CPUS) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu + 1;
+			return per_cpu_ptr(tbl->stats, cpu);
+		}
+		++cpu;
+	}
+
+	return NULL;
+}
+
+/*
+ * ->next for the per-table statistics file: return the statistics of the
+ * first possible CPU with index >= *pos and set *pos to that index + 1.
+ * Returns NULL once all CPUs have been visited.
+ */
+static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct proc_dir_entry *pde = seq->private;
+	struct neigh_table *tbl = pde->data;
+	int cpu = *pos;
+
+	while (cpu < NR_CPUS) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu + 1;
+			return per_cpu_ptr(tbl->stats, cpu);
+		}
+		++cpu;
+	}
+
+	return NULL;
+}
+
+/* ->stop for the statistics file: the body is intentionally empty. */
+static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+/*
+ * ->show for the statistics file.
+ *
+ * Prints the column header for SEQ_START_TOKEN; otherwise @v is one CPU's
+ * neigh_statistics and a line of hexadecimal counters is printed.  The
+ * first column (entries) is the table-wide entry count, not a per-cpu
+ * value.  Always returns 0.
+ */
+static int neigh_stat_seq_show(struct seq_file *seq, void *v)
+{
+ struct proc_dir_entry *pde = seq->private;
+ struct neigh_table *tbl = pde->data;
+ struct neigh_statistics *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n");
+ return 0;
+ }
+
+ /* Argument order must match the header columns above. */
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx %08lx %08lx\n",
+ atomic_read(&tbl->entries),
+
+ st->allocs,
+ st->destroys,
+ st->hash_grows,
+
+ st->lookups,
+ st->hits,
+
+ st->res_failed,
+
+ st->rcv_probes_mcast,
+ st->rcv_probes_ucast,
+
+ st->periodic_gc_runs,
+ st->forced_gc_runs,
+ st->unres_discards
+ );
+
+ return 0;
+}
+
+/* Iterator callbacks of the statistics file; installed by neigh_stat_seq_open(). */
+static const struct seq_operations neigh_stat_seq_ops = {
+ .start = neigh_stat_seq_start,
+ .next = neigh_stat_seq_next,
+ .stop = neigh_stat_seq_stop,
+ .show = neigh_stat_seq_show,
+};
+
+/*
+ * ->open for the statistics file: set up the seq_file with
+ * neigh_stat_seq_ops and store the inode's proc_dir_entry as its private
+ * data (the iterator callbacks read the table from pde->data).
+ * Returns the result of seq_open().
+ */
+static int neigh_stat_seq_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &neigh_stat_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+		sf->private = PDE(inode);
+	}
+	return ret;
+}
+
+/*
+ * File operations of the per-table statistics proc entry created in
+ * neigh_table_init_no_netlink().
+ */
+static const struct file_operations neigh_stat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = neigh_stat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Upper bound for the size of a message built by neigh_fill_info(); used
+ * by __neigh_notify() to size the skb.  Both address attributes are
+ * budgeted at MAX_ADDR_LEN.
+ */
+static inline size_t neigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(4); /* NDA_PROBES */
+}
+
+/*
+ * Send a netlink message of @type/@flags describing @n to the
+ * RTNLGRP_NEIGH group of the neighbour's namespace.
+ *
+ * Allocation uses GFP_ATOMIC.  On any failure (allocation, fill or send)
+ * the error is recorded on the group via rtnl_set_sk_err().
+ */
+static void __neigh_notify(struct neighbour *n, int type, int flags)
+{
+ struct net *net = dev_net(n->dev);
+ struct sk_buff *skb;
+ /* Reported if the skb allocation below fails. */
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = neigh_fill_info(skb, n, 0, 0, type, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+#ifdef CONFIG_ARPD
+/*
+ * Send an RTM_GETNEIGH message with NLM_F_REQUEST for @n to RTNLGRP_NEIGH
+ * listeners.  Only built with CONFIG_ARPD.
+ */
+void neigh_app_ns(struct neighbour *n)
+{
+ __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
+}
+EXPORT_SYMBOL(neigh_app_ns);
+#endif /* CONFIG_ARPD */
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * Template for the per-parms sysctl table; neigh_sysctl_register()
+ * duplicates it with kmemdup() and fills in the .data pointers.
+ *
+ * The order of the entries is significant: neigh_sysctl_register()
+ * addresses them by numeric index (neigh_vars[0] .. neigh_vars[13] get
+ * their .data from the neigh_parms, and neigh_vars[14] is zeroed to end
+ * the table early for per-device registrations).  Keep the index comments
+ * below in sync when adding or reordering entries.
+ */
+static struct neigh_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table neigh_vars[__NET_NEIGH_MAX];
+ char *dev_name;
+} neigh_sysctl_template __read_mostly = {
+ .neigh_vars = {
+ /* [0] */
+ {
+ .ctl_name = NET_NEIGH_MCAST_SOLICIT,
+ .procname = "mcast_solicit",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [1] */
+ {
+ .ctl_name = NET_NEIGH_UCAST_SOLICIT,
+ .procname = "ucast_solicit",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [2] */
+ {
+ .ctl_name = NET_NEIGH_APP_SOLICIT,
+ .procname = "app_solicit",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [3] no .ctl_name is set for this entry */
+ {
+ .procname = "retrans_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_userhz_jiffies,
+ },
+ /* [4] */
+ {
+ .ctl_name = NET_NEIGH_REACHABLE_TIME,
+ .procname = "base_reachable_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ /* [5] */
+ {
+ .ctl_name = NET_NEIGH_DELAY_PROBE_TIME,
+ .procname = "delay_first_probe_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ /* [6] */
+ {
+ .ctl_name = NET_NEIGH_GC_STALE_TIME,
+ .procname = "gc_stale_time",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ /* [7] */
+ {
+ .ctl_name = NET_NEIGH_UNRES_QLEN,
+ .procname = "unres_qlen",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [8] */
+ {
+ .ctl_name = NET_NEIGH_PROXY_QLEN,
+ .procname = "proxy_qlen",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [9] no .ctl_name is set for this entry */
+ {
+ .procname = "anycast_delay",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_userhz_jiffies,
+ },
+ /* [10] no .ctl_name is set for this entry */
+ {
+ .procname = "proxy_delay",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_userhz_jiffies,
+ },
+ /* [11] no .ctl_name is set for this entry */
+ {
+ .procname = "locktime",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_userhz_jiffies,
+ },
+ /* [12] millisecond view of the same field as [3] */
+ {
+ .ctl_name = NET_NEIGH_RETRANS_TIME_MS,
+ .procname = "retrans_time_ms",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_ms_jiffies,
+ .strategy = &sysctl_ms_jiffies,
+ },
+ /* [13] millisecond view of the same field as [4] */
+ {
+ .ctl_name = NET_NEIGH_REACHABLE_TIME_MS,
+ .procname = "base_reachable_time_ms",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_ms_jiffies,
+ .strategy = &sysctl_ms_jiffies,
+ },
+ /* [14] first entry cut off for per-device tables */
+ {
+ .ctl_name = NET_NEIGH_GC_INTERVAL,
+ .procname = "gc_interval",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ /* [15] */
+ {
+ .ctl_name = NET_NEIGH_GC_THRESH1,
+ .procname = "gc_thresh1",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [16] */
+ {
+ .ctl_name = NET_NEIGH_GC_THRESH2,
+ .procname = "gc_thresh2",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* [17] */
+ {
+ .ctl_name = NET_NEIGH_GC_THRESH3,
+ .procname = "gc_thresh3",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ /* Empty entry terminating the table. */
+ {},
+ },
+};
+
+/*
+ * neigh_sysctl_register - create the sysctl directory for one neigh_parms
+ * @dev: device the parameters belong to, or NULL for the "default" entry
+ * @p: parameter block whose fields back the sysctl values
+ * @p_id: binary ctl_name for the protocol path component
+ * @pdev_id: binary ctl_name for the "neigh" path component
+ * @p_name: procname for the protocol path component
+ * @handler: optional proc_handler override for the (re)trans/reachable entries
+ * @strategy: optional strategy override for the same entries
+ *
+ * Duplicates neigh_sysctl_template, points each entry at the matching
+ * field of @p and registers the result under net/<p_name>/neigh/<name>,
+ * where <name> is dev->name or "default".  On success the table is stored
+ * in p->sysctl_table and 0 is returned; every failure returns -ENOBUFS.
+ */
+int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
+ int p_id, int pdev_id, char *p_name,
+ proc_handler *handler, ctl_handler *strategy)
+{
+ struct neigh_sysctl_table *t;
+ const char *dev_name_source = NULL;
+
+/* Indices into neigh_path[] below. */
+#define NEIGH_CTL_PATH_ROOT 0
+#define NEIGH_CTL_PATH_PROTO 1
+#define NEIGH_CTL_PATH_NEIGH 2
+#define NEIGH_CTL_PATH_DEV 3
+
+ /* Path skeleton; the PROTO, NEIGH and DEV slots are filled in later. */
+ struct ctl_path neigh_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "proto", .ctl_name = 0, },
+ { .procname = "neigh", .ctl_name = 0, },
+ { .procname = "default", .ctl_name = NET_PROTO_CONF_DEFAULT, },
+ { },
+ };
+
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto err;
+
+ /*
+ * The indices must stay in step with the entry order of
+ * neigh_sysctl_template.  Entries 12 and 13 expose the same fields
+ * as 3 and 4 (the *_ms variants of retrans/base_reachable time).
+ */
+ t->neigh_vars[0].data = &p->mcast_probes;
+ t->neigh_vars[1].data = &p->ucast_probes;
+ t->neigh_vars[2].data = &p->app_probes;
+ t->neigh_vars[3].data = &p->retrans_time;
+ t->neigh_vars[4].data = &p->base_reachable_time;
+ t->neigh_vars[5].data = &p->delay_probe_time;
+ t->neigh_vars[6].data = &p->gc_staletime;
+ t->neigh_vars[7].data = &p->queue_len;
+ t->neigh_vars[8].data = &p->proxy_qlen;
+ t->neigh_vars[9].data = &p->anycast_delay;
+ t->neigh_vars[10].data = &p->proxy_delay;
+ t->neigh_vars[11].data = &p->locktime;
+ t->neigh_vars[12].data = &p->retrans_time;
+ t->neigh_vars[13].data = &p->base_reachable_time;
+
+ if (dev) {
+ dev_name_source = dev->name;
+ neigh_path[NEIGH_CTL_PATH_DEV].ctl_name = dev->ifindex;
+ /* Terminate the table early */
+ memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
+ } else {
+ dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
+ /*
+ * Entries 14..17 are backed by four ints located directly after
+ * *p in memory.  NOTE(review): presumably the gc_interval and
+ * gc_thresh1..3 members of the enclosing neigh_table - confirm
+ * against the struct layout.
+ */
+ t->neigh_vars[14].data = (int *)(p + 1);
+ t->neigh_vars[15].data = (int *)(p + 1) + 1;
+ t->neigh_vars[16].data = (int *)(p + 1) + 2;
+ t->neigh_vars[17].data = (int *)(p + 1) + 3;
+ }
+
+
+ /*
+ * Caller supplied handlers replace the defaults for the four
+ * retrans/reachable entries.  Without a strategy routine the entry
+ * is marked CTL_UNNUMBERED.
+ */
+ if (handler || strategy) {
+ /* RetransTime */
+ t->neigh_vars[3].proc_handler = handler;
+ t->neigh_vars[3].strategy = strategy;
+ t->neigh_vars[3].extra1 = dev;
+ if (!strategy)
+ t->neigh_vars[3].ctl_name = CTL_UNNUMBERED;
+ /* ReachableTime */
+ t->neigh_vars[4].proc_handler = handler;
+ t->neigh_vars[4].strategy = strategy;
+ t->neigh_vars[4].extra1 = dev;
+ if (!strategy)
+ t->neigh_vars[4].ctl_name = CTL_UNNUMBERED;
+ /* RetransTime (in milliseconds)*/
+ t->neigh_vars[12].proc_handler = handler;
+ t->neigh_vars[12].strategy = strategy;
+ t->neigh_vars[12].extra1 = dev;
+ if (!strategy)
+ t->neigh_vars[12].ctl_name = CTL_UNNUMBERED;
+ /* ReachableTime (in milliseconds) */
+ t->neigh_vars[13].proc_handler = handler;
+ t->neigh_vars[13].strategy = strategy;
+ t->neigh_vars[13].extra1 = dev;
+ if (!strategy)
+ t->neigh_vars[13].ctl_name = CTL_UNNUMBERED;
+ }
+
+ /* Keep a private copy of the name; it is freed together with the table. */
+ t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
+ if (!t->dev_name)
+ goto free;
+
+ neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name;
+ neigh_path[NEIGH_CTL_PATH_NEIGH].ctl_name = pdev_id;
+ neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name;
+ neigh_path[NEIGH_CTL_PATH_PROTO].ctl_name = p_id;
+
+ t->sysctl_header =
+ register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars);
+ if (!t->sysctl_header)
+ goto free_procname;
+
+ p->sysctl_table = t;
+ return 0;
+
+ /* Unwind in reverse order of acquisition. */
+free_procname:
+ kfree(t->dev_name);
+free:
+ kfree(t);
+err:
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(neigh_sysctl_register);
+
+/*
+ * Tear down the sysctl table attached to @p, if there is one: detach it
+ * from @p, unregister it and release the name copy and the table itself.
+ */
+void neigh_sysctl_unregister(struct neigh_parms *p)
+{
+	struct neigh_sysctl_table *table = p->sysctl_table;
+
+	if (!table)
+		return;
+
+	p->sysctl_table = NULL;
+	unregister_sysctl_table(table->sysctl_header);
+	kfree(table->dev_name);
+	kfree(table);
+}
+EXPORT_SYMBOL(neigh_sysctl_unregister);
+
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Register the rtnetlink handlers of the neighbour subsystem for
+ * PF_UNSPEC: doit handlers for NEWNEIGH/DELNEIGH/SETNEIGHTBL and dump
+ * handlers for GETNEIGH/GETNEIGHTBL.  Always returns 0.
+ */
+static int __init neigh_init(void)
+{
+ rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info);
+
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info);
+ rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL);
+
+ return 0;
+}
+
+subsys_initcall(neigh_init);
+
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
new file mode 100644
index 0000000..92d6b94
--- /dev/null
+++ b/net/core/net-sysfs.c
@@ -0,0 +1,529 @@
+/*
+ * net-sysfs.c - network device class and attributes
+ *
+ * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <net/sock.h>
+#include <linux/rtnetlink.h>
+#include <linux/wireless.h>
+#include <net/iw_handler.h>
+
+#include "net-sysfs.h"
+
+#ifdef CONFIG_SYSFS
+static const char fmt_hex[] = "%#x\n";
+static const char fmt_long_hex[] = "%#lx\n";
+static const char fmt_dec[] = "%d\n";
+static const char fmt_ulong[] = "%lu\n";
+
+/*
+ * Non-zero while the device's registration state has not advanced past
+ * NETREG_REGISTERED; attribute handlers use it to refuse access to
+ * devices that are being torn down.
+ */
+static inline int dev_isalive(const struct net_device *dev)
+{
+ return dev->reg_state <= NETREG_REGISTERED;
+}
+
+/*
+ * Common read path for simple attributes: run @format on the net_device
+ * behind @dev under dev_base_lock (same locking rules as GIF* ioctl's).
+ * Returns what @format returns, or -EINVAL if the device is not alive.
+ */
+static ssize_t netdev_show(const struct device *dev,
+			   struct device_attribute *attr, char *buf,
+			   ssize_t (*format)(const struct net_device *, char *))
+{
+	struct net_device *ndev = to_net_dev(dev);
+	ssize_t len;
+
+	read_lock(&dev_base_lock);
+	len = dev_isalive(ndev) ? (*format)(ndev, buf) : -EINVAL;
+	read_unlock(&dev_base_lock);
+
+	return len;
+}
+
+/*
+ * Generate a show function for a simple field.
+ *
+ * NETDEVICE_SHOW(field, fmt) emits format_<field>(), which prints
+ * net->field into the buffer with sprintf() and the given format string,
+ * and show_<field>(), the sysfs callback that runs it via netdev_show().
+ */
+#define NETDEVICE_SHOW(field, format_string) \
+static ssize_t format_##field(const struct net_device *net, char *buf) \
+{ \
+ return sprintf(buf, format_string, net->field); \
+} \
+static ssize_t show_##field(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return netdev_show(dev, attr, buf, format_##field); \
+}
+
+
+/*
+ * Common write path for numeric attributes (same locking and permission
+ * rules as SIF* ioctl's): requires CAP_NET_ADMIN, parses an unsigned long
+ * from @buf and hands it to @set under the rtnl lock.
+ *
+ * Returns @len when @set succeeds, the error from @set otherwise, -EPERM
+ * without the capability, and -EINVAL when no number could be parsed or
+ * the device is no longer alive.
+ */
+static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t len,
+			    int (*set)(struct net_device *, unsigned long))
+{
+	struct net_device *ndev = to_net_dev(dev);
+	unsigned long value;
+	char *end;
+	int rc;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Base 0: accept decimal, octal and hex input. */
+	value = simple_strtoul(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+
+	rc = -EINVAL;
+	rtnl_lock();
+	if (dev_isalive(ndev)) {
+		rc = (*set)(ndev, value);
+		if (rc == 0)
+			rc = len;
+	}
+	rtnl_unlock();
+
+	return rc;
+}
+
+/* show_<field>() helpers for the attributes that have no store method */
+NETDEVICE_SHOW(dev_id, fmt_hex);
+NETDEVICE_SHOW(addr_len, fmt_dec);
+NETDEVICE_SHOW(iflink, fmt_dec);
+NETDEVICE_SHOW(ifindex, fmt_dec);
+NETDEVICE_SHOW(features, fmt_long_hex);
+NETDEVICE_SHOW(type, fmt_dec);
+NETDEVICE_SHOW(link_mode, fmt_dec);
+
+/*
+ * Print the device hardware address (same locking rules as the GIFHWADDR
+ * ioctl).  Returns -EINVAL if the device is not alive.
+ */
+static ssize_t show_address(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+	ssize_t len;
+
+	read_lock(&dev_base_lock);
+	len = dev_isalive(ndev) ?
+		sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len) : -EINVAL;
+	read_unlock(&dev_base_lock);
+
+	return len;
+}
+
+/* Print the device broadcast address, or -EINVAL if the device is not alive. */
+static ssize_t show_broadcast(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+
+	if (!dev_isalive(ndev))
+		return -EINVAL;
+
+	return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+}
+
+/*
+ * Print 1 or 0 according to netif_carrier_ok(); -EINVAL while the
+ * interface is not running.
+ */
+static ssize_t show_carrier(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+
+	if (!netif_running(ndev))
+		return -EINVAL;
+
+	return sprintf(buf, fmt_dec, !!netif_carrier_ok(ndev));
+}
+
+/*
+ * Print 1 or 0 according to netif_dormant(); -EINVAL while the
+ * interface is not running.
+ */
+static ssize_t show_dormant(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+
+	if (!netif_running(ndev))
+		return -EINVAL;
+
+	return sprintf(buf, fmt_dec, !!netif_dormant(ndev));
+}
+
+/*
+ * Names printed by show_operstate(), indexed by the operstate value.
+ * NOTE(review): the order presumably mirrors the IF_OPER_* constants -
+ * confirm against their definition before reordering.
+ */
+static const char *operstates[] = {
+ "unknown",
+ "notpresent", /* currently unused */
+ "down",
+ "lowerlayerdown",
+ "testing", /* currently unused */
+ "dormant",
+ "up"
+};
+
+/*
+ * Print the operational state name of the device.  A device that is not
+ * running is always reported as IF_OPER_DOWN.  Returns -EINVAL for a
+ * state value that has no entry in operstates[].
+ */
+static ssize_t show_operstate(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	const struct net_device *netdev = to_net_dev(dev);
+	unsigned char state;
+
+	read_lock(&dev_base_lock);
+	state = netif_running(netdev) ? netdev->operstate : IF_OPER_DOWN;
+	read_unlock(&dev_base_lock);
+
+	if (state < ARRAY_SIZE(operstates))
+		return sprintf(buf, "%s\n", operstates[state]);
+
+	return -EINVAL; /* should not happen */
+}
+
+/* read-write attributes */
+NETDEVICE_SHOW(mtu, fmt_dec);
+
+/* netdev_store() callback: apply the parsed value through dev_set_mtu(). */
+static int change_mtu(struct net_device *net, unsigned long new_mtu)
+{
+ return dev_set_mtu(net, (int) new_mtu);
+}
+
+/* sysfs store method for "mtu"; parsing and locking are done by netdev_store(). */
+static ssize_t store_mtu(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_mtu);
+}
+
+NETDEVICE_SHOW(flags, fmt_hex);
+
+/* netdev_store() callback: apply the parsed value through dev_change_flags(). */
+static int change_flags(struct net_device *net, unsigned long new_flags)
+{
+ return dev_change_flags(net, (unsigned) new_flags);
+}
+
+/* sysfs store method for "flags"; parsing and locking are done by netdev_store(). */
+static ssize_t store_flags(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_flags);
+}
+
+NETDEVICE_SHOW(tx_queue_len, fmt_ulong);
+
+/* netdev_store() callback: write the new length straight into the device; never fails. */
+static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
+{
+ net->tx_queue_len = new_len;
+ return 0;
+}
+
+/* sysfs store method for "tx_queue_len"; parsing and locking are done by netdev_store(). */
+static ssize_t store_tx_queue_len(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_tx_queue_len);
+}
+
+/*
+ * sysfs store method for "ifalias".  Requires CAP_NET_ADMIN.  A single
+ * trailing newline is not passed on to dev_set_alias(), which is called
+ * under the rtnl lock.  Returns @len on success, a negative error
+ * otherwise.
+ */
+static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t len)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	size_t count;
+	ssize_t ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* ignore trailing newline */
+	count = (len > 0 && buf[len - 1] == '\n') ? len - 1 : len;
+
+	rtnl_lock();
+	ret = dev_set_alias(netdev, buf, count);
+	rtnl_unlock();
+
+	if (ret < 0)
+		return ret;
+
+	return len;
+}
+
+/*
+ * sysfs show method for "ifalias": prints the alias followed by a
+ * newline, or nothing (length 0) when no alias is set.  The alias is
+ * read under the rtnl lock.
+ */
+static ssize_t show_ifalias(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	const struct net_device *netdev = to_net_dev(dev);
+	ssize_t len;
+
+	rtnl_lock();
+	len = netdev->ifalias ? sprintf(buf, "%s\n", netdev->ifalias) : 0;
+	rtnl_unlock();
+
+	return len;
+}
+
+/*
+ * Attributes of the "net" class (installed through net_class.dev_attrs).
+ * World-readable; the entries with a store method are additionally
+ * writable by the owner.  The empty entry terminates the array.
+ */
+static struct device_attribute net_class_attributes[] = {
+ __ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
+ __ATTR(dev_id, S_IRUGO, show_dev_id, NULL),
+ __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias),
+ __ATTR(iflink, S_IRUGO, show_iflink, NULL),
+ __ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
+ __ATTR(features, S_IRUGO, show_features, NULL),
+ __ATTR(type, S_IRUGO, show_type, NULL),
+ __ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
+ __ATTR(address, S_IRUGO, show_address, NULL),
+ __ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
+ __ATTR(carrier, S_IRUGO, show_carrier, NULL),
+ __ATTR(dormant, S_IRUGO, show_dormant, NULL),
+ __ATTR(operstate, S_IRUGO, show_operstate, NULL),
+ __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
+ __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
+ __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
+ store_tx_queue_len),
+ {}
+};
+
+/*
+ * Show a given attribute in the statistics group.
+ *
+ * @offset is the byte offset of one unsigned long counter inside
+ * struct net_device_stats.  The counter is fetched through the device's
+ * get_stats() method under dev_base_lock and printed in decimal.
+ * Returns the number of bytes written, or -EINVAL if the device is not
+ * alive.
+ */
+static ssize_t netstat_show(const struct device *d,
+			    struct device_attribute *attr, char *buf,
+			    unsigned long offset)
+{
+	struct net_device *dev = to_net_dev(d);
+	struct net_device_stats *stats;
+	ssize_t ret = -EINVAL;
+
+	/*
+	 * offset == sizeof(struct) already points one past the last
+	 * counter, so it has to trip the warning as well.
+	 */
+	WARN_ON(offset >= sizeof(struct net_device_stats) ||
+		offset % sizeof(unsigned long) != 0);
+
+	read_lock(&dev_base_lock);
+	if (dev_isalive(dev)) {
+		stats = dev->get_stats(dev);
+		ret = sprintf(buf, fmt_ulong,
+			      *(unsigned long *)(((u8 *) stats) + offset));
+	}
+	read_unlock(&dev_base_lock);
+	return ret;
+}
+
+/*
+ * Generate a read-only statistics attribute.
+ *
+ * NETSTAT_ENTRY(name) emits show_<name>(), which passes the offset of
+ * <name> within struct net_device_stats to netstat_show(), and the
+ * world-readable device attribute dev_attr_<name>.
+ */
+#define NETSTAT_ENTRY(name) \
+static ssize_t show_##name(struct device *d, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return netstat_show(d, attr, buf, \
+ offsetof(struct net_device_stats, name)); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+/* One attribute per exported counter; each is also listed in netstat_attrs[]. */
+NETSTAT_ENTRY(rx_packets);
+NETSTAT_ENTRY(tx_packets);
+NETSTAT_ENTRY(rx_bytes);
+NETSTAT_ENTRY(tx_bytes);
+NETSTAT_ENTRY(rx_errors);
+NETSTAT_ENTRY(tx_errors);
+NETSTAT_ENTRY(rx_dropped);
+NETSTAT_ENTRY(tx_dropped);
+NETSTAT_ENTRY(multicast);
+NETSTAT_ENTRY(collisions);
+NETSTAT_ENTRY(rx_length_errors);
+NETSTAT_ENTRY(rx_over_errors);
+NETSTAT_ENTRY(rx_crc_errors);
+NETSTAT_ENTRY(rx_frame_errors);
+NETSTAT_ENTRY(rx_fifo_errors);
+NETSTAT_ENTRY(rx_missed_errors);
+NETSTAT_ENTRY(tx_aborted_errors);
+NETSTAT_ENTRY(tx_carrier_errors);
+NETSTAT_ENTRY(tx_fifo_errors);
+NETSTAT_ENTRY(tx_heartbeat_errors);
+NETSTAT_ENTRY(tx_window_errors);
+NETSTAT_ENTRY(rx_compressed);
+NETSTAT_ENTRY(tx_compressed);
+
+/* NULL-terminated list of the attributes that make up netstat_group. */
+static struct attribute *netstat_attrs[] = {
+ &dev_attr_rx_packets.attr,
+ &dev_attr_tx_packets.attr,
+ &dev_attr_rx_bytes.attr,
+ &dev_attr_tx_bytes.attr,
+ &dev_attr_rx_errors.attr,
+ &dev_attr_tx_errors.attr,
+ &dev_attr_rx_dropped.attr,
+ &dev_attr_tx_dropped.attr,
+ &dev_attr_multicast.attr,
+ &dev_attr_collisions.attr,
+ &dev_attr_rx_length_errors.attr,
+ &dev_attr_rx_over_errors.attr,
+ &dev_attr_rx_crc_errors.attr,
+ &dev_attr_rx_frame_errors.attr,
+ &dev_attr_rx_fifo_errors.attr,
+ &dev_attr_rx_missed_errors.attr,
+ &dev_attr_tx_aborted_errors.attr,
+ &dev_attr_tx_carrier_errors.attr,
+ &dev_attr_tx_fifo_errors.attr,
+ &dev_attr_tx_heartbeat_errors.attr,
+ &dev_attr_tx_window_errors.attr,
+ &dev_attr_rx_compressed.attr,
+ &dev_attr_tx_compressed.attr,
+ NULL
+};
+
+
+/* The "statistics" attribute group; added to every device by netdev_register_kobject(). */
+static struct attribute_group netstat_group = {
+ .name = "statistics",
+ .attrs = netstat_attrs,
+};
+
+#ifdef CONFIG_WIRELESS_EXT_SYSFS
+/*
+ * Helper that does all the locking etc for wireless stats: under
+ * dev_base_lock, obtain the statistics through the device's
+ * get_wireless_stats handler (when the device is alive and has one) and
+ * print them with @format.  Returns -EINVAL when no statistics could be
+ * obtained.
+ */
+static ssize_t wireless_show(struct device *d, char *buf,
+			     ssize_t (*format)(const struct iw_statistics *,
+					       char *))
+{
+	struct net_device *dev = to_net_dev(d);
+	const struct iw_statistics *stats = NULL;
+	ssize_t len = -EINVAL;
+
+	read_lock(&dev_base_lock);
+	if (dev_isalive(dev) &&
+	    dev->wireless_handlers &&
+	    dev->wireless_handlers->get_wireless_stats)
+		stats = dev->wireless_handlers->get_wireless_stats(dev);
+	if (stats)
+		len = (*format)(stats, buf);
+	read_unlock(&dev_base_lock);
+
+	return len;
+}
+
+/*
+ * Show function template for wireless fields.
+ *
+ * WIRELESS_SHOW(name, field, fmt) emits format_iw_<name>(), which prints
+ * iw-><field> with the given format string, show_iw_<name>(), which runs
+ * it through wireless_show(), and the world-readable attribute
+ * dev_attr_<name>.
+ */
+#define WIRELESS_SHOW(name, field, format_string) \
+static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \
+{ \
+ return sprintf(buf, format_string, iw->field); \
+} \
+static ssize_t show_iw_##name(struct device *d, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return wireless_show(d, buf, format_iw_##name); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL)
+
+/* Note: the attribute name and the struct field differ for some entries. */
+WIRELESS_SHOW(status, status, fmt_hex);
+WIRELESS_SHOW(link, qual.qual, fmt_dec);
+WIRELESS_SHOW(level, qual.level, fmt_dec);
+WIRELESS_SHOW(noise, qual.noise, fmt_dec);
+WIRELESS_SHOW(nwid, discard.nwid, fmt_dec);
+WIRELESS_SHOW(crypt, discard.code, fmt_dec);
+WIRELESS_SHOW(fragment, discard.fragment, fmt_dec);
+WIRELESS_SHOW(misc, discard.misc, fmt_dec);
+WIRELESS_SHOW(retries, discard.retries, fmt_dec);
+WIRELESS_SHOW(beacon, miss.beacon, fmt_dec);
+
+/* NULL-terminated list of the attributes that make up wireless_group. */
+static struct attribute *wireless_attrs[] = {
+ &dev_attr_status.attr,
+ &dev_attr_link.attr,
+ &dev_attr_level.attr,
+ &dev_attr_noise.attr,
+ &dev_attr_nwid.attr,
+ &dev_attr_crypt.attr,
+ &dev_attr_fragment.attr,
+ &dev_attr_retries.attr,
+ &dev_attr_misc.attr,
+ &dev_attr_beacon.attr,
+ NULL
+};
+
+/*
+ * The "wireless" attribute group; netdev_register_kobject() adds it only
+ * for devices that provide a get_wireless_stats handler.
+ */
+static struct attribute_group wireless_group = {
+ .name = "wireless",
+ .attrs = wireless_attrs,
+};
+#endif
+
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_HOTPLUG
+/*
+ * uevent callback of the net class: adds INTERFACE=<name> and
+ * IFINDEX=<ifindex> to the event environment.  Returns 0 on success or
+ * the error reported by add_uevent_var().
+ */
+static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
+{
+	struct net_device *dev = to_net_dev(d);
+	int err;
+
+	/* pass interface to uevent. */
+	err = add_uevent_var(env, "INTERFACE=%s", dev->name);
+	if (err)
+		return err;
+
+	/* pass ifindex to uevent.
+	 * ifindex is useful as it won't change (interface name may change)
+	 * and is what RtNetlink uses natively. */
+	return add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
+}
+#endif
+
+/*
+ * netdev_release -- destroy and free a dead device.
+ * Called when last reference to device kobject is gone.
+ *
+ * The device must already be in state NETREG_RELEASED; anything else is
+ * treated as a fatal bug.
+ */
+static void netdev_release(struct device *d)
+{
+ struct net_device *dev = to_net_dev(d);
+
+ BUG_ON(dev->reg_state != NETREG_RELEASED);
+
+ kfree(dev->ifalias);
+ /*
+ * The pointer handed to kfree() lies dev->padded bytes before the
+ * net_device.  NOTE(review): presumably the start of the original,
+ * alignment-padded allocation - confirm against the allocator.
+ */
+ kfree((char *)dev - dev->padded);
+}
+
+/*
+ * The "net" device class.  The default attributes and the uevent
+ * callback are only wired up when CONFIG_SYSFS / CONFIG_HOTPLUG are
+ * enabled; the release method is always present.
+ */
+static struct class net_class = {
+ .name = "net",
+ .dev_release = netdev_release,
+#ifdef CONFIG_SYSFS
+ .dev_attrs = net_class_attributes,
+#endif /* CONFIG_SYSFS */
+#ifdef CONFIG_HOTPLUG
+ .dev_uevent = netdev_uevent,
+#endif
+};
+
+/* Delete sysfs entries but hold kobject reference until after all
+ * netdev references are gone.
+ *
+ * The extra reference is taken before device_del() so that the embedded
+ * struct device survives the removal; it is not dropped here.
+ */
+void netdev_unregister_kobject(struct net_device * net)
+{
+ struct device *dev = &(net->dev);
+
+ kobject_get(&dev->kobj);
+ device_del(dev);
+}
+
+/* Create sysfs entries for network device.
+ *
+ * Binds the embedded struct device to net_class, names it after the
+ * interface, appends the statistics group (and, for devices with a
+ * get_wireless_stats handler, the wireless group) to net->sysfs_groups
+ * and adds the device.  Returns the result of device_add().
+ */
+int netdev_register_kobject(struct net_device *net)
+{
+ struct device *dev = &(net->dev);
+ struct attribute_group **groups = net->sysfs_groups;
+
+ dev->class = &net_class;
+ dev->platform_data = net;
+ dev->groups = groups;
+
+ /* Compile-time guarantee that an interface name fits into bus_id. */
+ BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ);
+ strlcpy(dev->bus_id, net->name, BUS_ID_SIZE);
+
+#ifdef CONFIG_SYSFS
+ /* groups walks net->sysfs_groups; each group is stored in the next slot. */
+ *groups++ = &netstat_group;
+
+#ifdef CONFIG_WIRELESS_EXT_SYSFS
+ if (net->wireless_handlers && net->wireless_handlers->get_wireless_stats)
+ *groups++ = &wireless_group;
+#endif
+#endif /* CONFIG_SYSFS */
+
+ return device_add(dev);
+}
+
+/* Add @class_attr to the net class; returns the result of class_create_file(). */
+int netdev_class_create_file(struct class_attribute *class_attr)
+{
+ return class_create_file(&net_class, class_attr);
+}
+
+/* Remove @class_attr from the net class. */
+void netdev_class_remove_file(struct class_attribute *class_attr)
+{
+ class_remove_file(&net_class, class_attr);
+}
+
+EXPORT_SYMBOL(netdev_class_create_file);
+EXPORT_SYMBOL(netdev_class_remove_file);
+
+/* Run device_initialize() on the struct device embedded in @net. */
+void netdev_initialize_kobject(struct net_device *net)
+{
+ struct device *device = &(net->dev);
+ device_initialize(device);
+}
+
+/* Register the net class; returns the result of class_register(). */
+int netdev_kobject_init(void)
+{
+ return class_register(&net_class);
+}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
new file mode 100644
index 0000000..14e7524
--- /dev/null
+++ b/net/core/net-sysfs.h
@@ -0,0 +1,8 @@
+#ifndef __NET_SYSFS_H__
+#define __NET_SYSFS_H__
+
+/* Interface of net-sysfs.c towards the rest of net/core. */
+
+/* Register the "net" device class. */
+int netdev_kobject_init(void);
+/* Create the sysfs entries for a network device. */
+int netdev_register_kobject(struct net_device *);
+/* Delete the sysfs entries of a network device. */
+void netdev_unregister_kobject(struct net_device *);
+/* Initialize the struct device embedded in a network device. */
+void netdev_initialize_kobject(struct net_device *);
+#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
new file mode 100644
index 0000000..0bc398c
--- /dev/null
+++ b/net/core/net_namespace.c
@@ -0,0 +1,490 @@
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ * Our network namespace constructor/destructor lists
+ *
+ * pernet_list holds every registered pernet_operations.  first_device
+ * is the list position in front of which subsystem operations are
+ * inserted; it equals &pernet_list until the first device operations are
+ * registered.  net_mutex is taken by the register/unregister functions
+ * in this file and around namespace setup and cleanup.
+ */
+
+static LIST_HEAD(pernet_list);
+static struct list_head *first_device = &pernet_list;
+static DEFINE_MUTEX(net_mutex);
+
+/* List of all network namespaces; modified under the rtnl lock in this file. */
+LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
+
+/* The initial network namespace; set up by net_ns_init(). */
+struct net init_net;
+EXPORT_SYMBOL(init_net);
+
+/* Number of pointer slots in the net_generic array allocated by setup_net(). */
+#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ *
+ * Sets the reference count to 1, allocates the initial net_generic
+ * pointer array and calls the init method of every registered
+ * pernet_operations in registration order.
+ *
+ * Returns 0 on success, -ENOMEM if the net_generic array cannot be
+ * allocated, or the negative error of the first init method that fails.
+ * On failure every init that already ran is undone and net->gen is left
+ * NULL, so that a caller which goes on to free @net (net_free() does
+ * kfree(net->gen)) does not free the array a second time.
+ */
+static __net_init int setup_net(struct net *net)
+{
+	/* Must be called with net_mutex held */
+	struct pernet_operations *ops;
+	int error;
+	struct net_generic *ng;
+
+	atomic_set(&net->count, 1);
+#ifdef NETNS_REFCNT_DEBUG
+	atomic_set(&net->use_count, 0);
+#endif
+
+	error = -ENOMEM;
+	ng = kzalloc(sizeof(struct net_generic) +
+			INITIAL_NET_GEN_PTRS * sizeof(void *), GFP_KERNEL);
+	if (ng == NULL)
+		goto out;
+
+	ng->len = INITIAL_NET_GEN_PTRS;
+	INIT_RCU_HEAD(&ng->rcu);
+	rcu_assign_pointer(net->gen, ng);
+
+	error = 0;
+	list_for_each_entry(ops, &pernet_list, list) {
+		if (ops->init) {
+			error = ops->init(net);
+			if (error < 0)
+				goto out_undo;
+		}
+	}
+out:
+	return error;
+
+out_undo:
+	/* Walk through the list backwards calling the exit functions
+	 * for the pernet modules whose init functions did not fail.
+	 */
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
+		if (ops->exit)
+			ops->exit(net);
+	}
+
+	rcu_barrier();
+	/* Do not leave a dangling pointer behind: the caller may still
+	 * pass this namespace to net_free().
+	 */
+	rcu_assign_pointer(net->gen, NULL);
+	kfree(ng);
+	goto out;
+}
+
+#ifdef CONFIG_NET_NS
+/* Slab cache for struct net and the workqueue running cleanup_net(); both created in net_ns_init(). */
+static struct kmem_cache *net_cachep;
+static struct workqueue_struct *netns_wq;
+
+/* Allocate a zeroed struct net from net_cachep; NULL on failure. */
+static struct net *net_alloc(void)
+{
+ return kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+}
+
+/*
+ * Free a namespace obtained from net_alloc() together with its
+ * net_generic array.  A NULL @net is ignored.  With NETNS_REFCNT_DEBUG,
+ * a namespace whose use_count is not zero is reported and deliberately
+ * not freed.
+ */
+static void net_free(struct net *net)
+{
+ if (!net)
+ return;
+
+#ifdef NETNS_REFCNT_DEBUG
+ if (unlikely(atomic_read(&net->use_count) != 0)) {
+ printk(KERN_EMERG "network namespace not free! Usage: %d\n",
+ atomic_read(&net->use_count));
+ return;
+ }
+#endif
+ kfree(net->gen);
+ kmem_cache_free(net_cachep, net);
+}
+
+/*
+ * copy_net_ns - return the network namespace to use for a new task
+ * @flags: clone flags; only CLONE_NEWNET is examined
+ * @old_net: namespace of the parent
+ *
+ * Without CLONE_NEWNET, @old_net is returned with an extra reference.
+ * With CLONE_NEWNET a new namespace is allocated, initialised by
+ * setup_net() under net_mutex and appended to net_namespace_list.
+ * Returns the namespace, or an ERR_PTR() on failure.
+ */
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+ struct net *new_net = NULL;
+ int err;
+
+ get_net(old_net);
+
+ /* Sharing: the reference taken above is handed to the caller. */
+ if (!(flags & CLONE_NEWNET))
+ return old_net;
+
+ err = -ENOMEM;
+ new_net = net_alloc();
+ if (!new_net)
+ goto out;
+
+ mutex_lock(&net_mutex);
+ err = setup_net(new_net);
+ if (err)
+ goto out_unlock;
+
+ rtnl_lock();
+ list_add_tail(&new_net->list, &net_namespace_list);
+ rtnl_unlock();
+
+
+out_unlock:
+ mutex_unlock(&net_mutex);
+out:
+ /* A new namespace was requested, so the reference on old_net is dropped again. */
+ put_net(old_net);
+ if (err) {
+ /* new_net may be NULL here; net_free() copes with that. */
+ net_free(new_net);
+ new_net = ERR_PTR(err);
+ }
+ return new_net;
+}
+
+/*
+ * Work function queued by __put_net(): unlink the namespace embedding
+ * @work from net_namespace_list, run every pernet exit method in reverse
+ * registration order and free the namespace.
+ */
+static void cleanup_net(struct work_struct *work)
+{
+ struct pernet_operations *ops;
+ struct net *net;
+
+ /* Be very certain incoming network packets will not find us */
+ rcu_barrier();
+
+ net = container_of(work, struct net, work);
+
+ mutex_lock(&net_mutex);
+
+ /* Don't let anyone else find us. */
+ rtnl_lock();
+ list_del(&net->list);
+ rtnl_unlock();
+
+ /* Run all of the network namespace exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list) {
+ if (ops->exit)
+ ops->exit(net);
+ }
+
+ mutex_unlock(&net_mutex);
+
+ /* Ensure there are no outstanding rcu callbacks using this
+ * network namespace.
+ */
+ rcu_barrier();
+
+ /* Finally it is safe to free my network namespace structure */
+ net_free(net);
+}
+
+/*
+ * Schedule the destruction of @net: cleanup_net() is queued on the netns
+ * workqueue instead of being called directly.
+ */
+void __put_net(struct net *net)
+{
+ /* Cleanup the network namespace in process context */
+ INIT_WORK(&net->work, cleanup_net);
+ queue_work(netns_wq, &net->work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+#else
+/*
+ * Variant for kernels without CONFIG_NET_NS: a request for a new
+ * namespace fails with -EINVAL, anything else returns @old_net as is.
+ */
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+ if (flags & CLONE_NEWNET)
+ return ERR_PTR(-EINVAL);
+ return old_net;
+}
+#endif
+
+/*
+ * Boot-time initialisation of the namespace infrastructure.  With
+ * CONFIG_NET_NS it creates the slab cache and the cleanup workqueue;
+ * in all configurations it sets up init_net and puts it on
+ * net_namespace_list.  Any failure panics; otherwise returns 0.
+ */
+static int __init net_ns_init(void)
+{
+ int err;
+
+ printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
+#ifdef CONFIG_NET_NS
+ /* SLAB_PANIC: the return value is not checked here. */
+ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+ SMP_CACHE_BYTES,
+ SLAB_PANIC, NULL);
+
+ /* Create workqueue for cleanup */
+ netns_wq = create_singlethread_workqueue("netns");
+ if (!netns_wq)
+ panic("Could not create netns workq");
+#endif
+
+ mutex_lock(&net_mutex);
+ err = setup_net(&init_net);
+
+ rtnl_lock();
+ list_add_tail(&init_net.list, &net_namespace_list);
+ rtnl_unlock();
+
+ mutex_unlock(&net_mutex);
+ if (err)
+ panic("Could not setup the initial network namespace");
+
+ return 0;
+}
+
+pure_initcall(net_ns_init);
+
+#ifdef CONFIG_NET_NS
+/*
+ * Insert @ops in front of @list and run its init method for every
+ * existing namespace.  If an init fails, @ops is unlinked again, its exit
+ * method is run for the namespaces initialised before the failing one,
+ * and the error is returned.  Returns 0 on success.
+ */
+static int register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ struct net *net, *undo_net;
+ int error;
+
+ list_add_tail(&ops->list, list);
+ if (ops->init) {
+ for_each_net(net) {
+ error = ops->init(net);
+ if (error)
+ goto out_undo;
+ }
+ }
+ return 0;
+
+out_undo:
+ /* If I have an error cleanup all namespaces I initialized */
+ list_del(&ops->list);
+ if (ops->exit) {
+ for_each_net(undo_net) {
+ /* net is the namespace whose init failed: stop there. */
+ if (undo_net == net)
+ goto undone;
+ ops->exit(undo_net);
+ }
+ }
+undone:
+ return error;
+}
+
+/* Unlink @ops and run its exit method, if any, for every existing namespace. */
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+ struct net *net;
+
+ list_del(&ops->list);
+ if (ops->exit)
+ for_each_net(net)
+ ops->exit(net);
+}
+
+#else
+
+/*
+ * Variant without CONFIG_NET_NS: only init_net exists, so @ops is not
+ * linked into @list; its init method, if any, is simply run for init_net.
+ */
+static int register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ if (ops->init == NULL)
+ return 0;
+ return ops->init(&init_net);
+}
+
+/* Variant without CONFIG_NET_NS: run the exit method, if any, for init_net. */
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+ if (ops->exit)
+ ops->exit(&init_net);
+}
+#endif
+
+/* Allocator for the ids handed out by the register_pernet_gen_*() functions. */
+static DEFINE_IDA(net_generic_ids);
+
+/**
+ * register_pernet_subsys - register a network namespace subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a subsystem which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. This allows kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ *
+ * Returns 0 on success or the error of the failing init method.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+ int error;
+ mutex_lock(&net_mutex);
+ /* Subsystems are inserted in front of the first device operations. */
+ error = register_pernet_operations(first_device, ops);
+ mutex_unlock(&net_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ * unregister_pernet_subsys - unregister a network namespace subsystem
+ * @module: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_subsys(struct pernet_operations *module)
+{
+ mutex_lock(&net_mutex);
+ unregister_pernet_operations(module);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
+
+/*
+ * register_pernet_gen_subsys - register a subsystem that uses net_generic
+ * @id: receives the generic id (>= 1) allocated for the subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Allocates an id from net_generic_ids and then registers @ops like
+ * register_pernet_subsys().  The id is released again if registration
+ * fails.  Returns 0 on success, -ENOMEM if the id allocator cannot get
+ * memory, or another negative error.
+ */
+int register_pernet_gen_subsys(int *id, struct pernet_operations *ops)
+{
+	int rv;
+
+	mutex_lock(&net_mutex);
+again:
+	rv = ida_get_new_above(&net_generic_ids, 1, id);
+	if (rv == -EAGAIN) {
+		/* The allocator needs more memory.  ida_pre_get() returns
+		 * 0 when it cannot get any; retrying would then loop
+		 * forever, so give up instead.
+		 */
+		if (ida_pre_get(&net_generic_ids, GFP_KERNEL))
+			goto again;
+		rv = -ENOMEM;
+	}
+	if (rv < 0)
+		goto out;
+
+	rv = register_pernet_operations(first_device, ops);
+	if (rv < 0)
+		ida_remove(&net_generic_ids, *id);
+out:
+	mutex_unlock(&net_mutex);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(register_pernet_gen_subsys);
+
+/*
+ * Counterpart of register_pernet_gen_subsys(): unregister @ops and give
+ * the generic id @id back to the allocator.
+ */
+void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ unregister_pernet_operations(ops);
+ ida_remove(&net_generic_ids, id);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys);
+
+/**
+ * register_pernet_device - register a network namespace device
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a device which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. This allows kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ *
+ * Returns 0 on success or the error of the failing init method.
+ */
+int register_pernet_device(struct pernet_operations *ops)
+{
+ int error;
+ mutex_lock(&net_mutex);
+ /* Devices always go to the tail of the list, behind all subsystems. */
+ error = register_pernet_operations(&pernet_list, ops);
+ /* Remember the first device entry; subsystems are inserted before it. */
+ if (!error && (first_device == &pernet_list))
+ first_device = &ops->list;
+ mutex_unlock(&net_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_device);
+
+/*
+ * register_pernet_gen_device - register a device that uses net_generic
+ * @id: receives the generic id (>= 1) allocated for the device
+ * @ops: pernet operations structure for the device
+ *
+ * Allocates an id from net_generic_ids and then registers @ops like
+ * register_pernet_device().  The id is released again if registration
+ * fails.  Returns 0 on success, -ENOMEM if the id allocator cannot get
+ * memory, or another negative error.
+ */
+int register_pernet_gen_device(int *id, struct pernet_operations *ops)
+{
+	int error;
+
+	mutex_lock(&net_mutex);
+again:
+	error = ida_get_new_above(&net_generic_ids, 1, id);
+	if (error == -EAGAIN) {
+		/* The allocator needs more memory.  ida_pre_get() returns
+		 * 0 when it cannot get any; retrying would then loop
+		 * forever, so give up instead.
+		 */
+		if (ida_pre_get(&net_generic_ids, GFP_KERNEL))
+			goto again;
+		error = -ENOMEM;
+	}
+	if (error)
+		goto out;
+
+	error = register_pernet_operations(&pernet_list, ops);
+	if (error)
+		ida_remove(&net_generic_ids, *id);
+	else if (first_device == &pernet_list)
+		first_device = &ops->list;
+out:
+	mutex_unlock(&net_mutex);
+	return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_gen_device);
+
+/**
+ * unregister_pernet_device - unregister a network namespace netdevice
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_device(struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ /* Move the subsystem/device boundary off the entry that is going away. */
+ if (&ops->list == first_device)
+ first_device = first_device->next;
+ unregister_pernet_operations(ops);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+/*
+ * Counterpart of register_pernet_gen_device(): unregister @ops like
+ * unregister_pernet_device() and give the generic id @id back to the
+ * allocator.
+ */
+void unregister_pernet_gen_device(int id, struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ /* Move the subsystem/device boundary off the entry that is going away. */
+ if (&ops->list == first_device)
+ first_device = first_device->next;
+ unregister_pernet_operations(ops);
+ ida_remove(&net_generic_ids, id);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_gen_device);
+
+/* RCU callback: free the net_generic array that embeds @rcu. */
+static void net_generic_release(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct net_generic, rcu));
+}
+
+/*
+ * net_assign_generic - store a per-namespace pointer for a generic id
+ * @net: namespace to modify
+ * @id: generic id (>= 1) obtained from register_pernet_gen_*()
+ * @data: pointer to store in slot @id
+ *
+ * Must be called with net_mutex held.  If @id does not fit into the
+ * current array, a larger array is allocated, the existing pointers are
+ * copied over and the old array is freed after an RCU grace period.
+ * Returns 0 on success or -ENOMEM if the larger array cannot be
+ * allocated.
+ */
+int net_assign_generic(struct net *net, int id, void *data)
+{
+	struct net_generic *ng, *old_ng;
+
+	BUG_ON(!mutex_is_locked(&net_mutex));
+	BUG_ON(id == 0);
+
+	ng = old_ng = net->gen;
+	if (old_ng->len >= id)
+		goto assign;
+
+	ng = kzalloc(sizeof(struct net_generic) +
+			id * sizeof(void *), GFP_KERNEL);
+	if (ng == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Some synchronisation notes:
+	 *
+	 * The net_generic explores the net->gen array inside rcu
+	 * read section. Besides once set the net->gen->ptr[x]
+	 * pointer never changes (see rules in netns/generic.h).
+	 *
+	 * That said, we simply duplicate this array and schedule
+	 * the old copy for kfree after a grace period.
+	 */
+
+	ng->len = id;
+	INIT_RCU_HEAD(&ng->rcu);
+	/* len counts pointers, not bytes: copy the whole old array. */
+	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void *));
+
+	rcu_assign_pointer(net->gen, ng);
+	call_rcu(&old_ng->rcu, net_generic_release);
+assign:
+	/* ids start at 1, the array at index 0 */
+	ng->ptr[id - 1] = data;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(net_assign_generic);
diff --git a/net/core/netevent.c b/net/core/netevent.c
new file mode 100644
index 0000000..95f81de
--- /dev/null
+++ b/net/core/netevent.c
@@ -0,0 +1,70 @@
+/*
+ * Network event notifiers
+ *
+ * Authors:
+ * Tom Tucker <tom@opengridcomputing.com>
+ * Steve Wise <swise@opengridcomputing.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <net/netevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
+
+/**
+ * register_netevent_notifier - add a notifier block to the netevent chain
+ * @nb: notifier block to register
+ *
+ * Link @nb into the netevent notifier chain so that it is invoked by
+ * call_netevent_notifiers(). The block must not be reused until it has
+ * been unregistered again. The return value is that of
+ * atomic_notifier_chain_register().
+ */
+int register_netevent_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&netevent_notif_chain, nb);
+}
+
+/**
+ * unregister_netevent_notifier - remove a notifier block from the netevent chain
+ * @nb: notifier block to unregister
+ *
+ * Unlink a notifier previously registered with
+ * register_netevent_notifier() from the netevent chain; afterwards the
+ * block may be reused. The return value is that of
+ * atomic_notifier_chain_unregister().
+ */
+int unregister_netevent_notifier(struct notifier_block *nb)
+{
+	int err;
+
+	err = atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
+	return err;
+}
+
+/**
+ * call_netevent_notifiers - invoke every registered netevent notifier
+ * @val: value passed unmodified to each notifier function
+ * @v: pointer passed unmodified to each notifier function
+ *
+ * Walk the netevent notifier chain. The return value is that of
+ * atomic_notifier_call_chain().
+ */
+int call_netevent_notifiers(unsigned long val, void *v)
+{
+	int ret;
+
+	ret = atomic_notifier_call_chain(&netevent_notif_chain, val, v);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
+EXPORT_SYMBOL_GPL(call_netevent_notifiers);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
new file mode 100644
index 0000000..dadac62
--- /dev/null
+++ b/net/core/netpoll.c
@@ -0,0 +1,852 @@
+/*
+ * Common framework for low-level network console, dump, and debugger code
+ *
+ * Sep 8 2003 Matt Mackall <mpm@selenic.com>
+ *
+ * based on the netconsole code from:
+ *
+ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Red Hat, Inc.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/unaligned.h>
+
+/*
+ * We maintain a small pool of fully-sized skbs, to make sure the
+ * message gets out even in extreme OOM situations.
+ */
+
+#define MAX_UDP_CHUNK 1460
+#define MAX_SKBS 32
+#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+
+static struct sk_buff_head skb_pool;
+
+static atomic_t trapped;
+
+#define USEC_PER_POLL 50
+#define NETPOLL_RX_ENABLED 1
+#define NETPOLL_RX_DROP 2
+
+#define MAX_SKB_SIZE \
+ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
+ sizeof(struct iphdr) + sizeof(struct ethhdr))
+
+static void zap_completion_queue(void);
+static void arp_reply(struct sk_buff *skb);
+
+/*
+ * Delayed-work handler for npinfo->tx_work: drain npinfo->txq by handing
+ * each queued skb to its device's hard_start_xmit().
+ *
+ * If the device tx queue is stopped or frozen, or the driver does not
+ * return NETDEV_TX_OK, the skb is put back at the head of txq and the
+ * work is rescheduled HZ/10 jiffies later.
+ */
+static void queue_process(struct work_struct *work)
+{
+ struct netpoll_info *npinfo =
+ container_of(work, struct netpoll_info, tx_work.work);
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ while ((skb = skb_dequeue(&npinfo->txq))) {
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+
+ /* Device is gone or down: drop the packet and move on. */
+ if (!netif_device_present(dev) || !netif_running(dev)) {
+ __kfree_skb(skb);
+ continue;
+ }
+
+ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+ /* Transmit with local interrupts off and the tx queue lock held. */
+ local_irq_save(flags);
+ __netif_tx_lock(txq, smp_processor_id());
+ if (netif_tx_queue_stopped(txq) ||
+ netif_tx_queue_frozen(txq) ||
+ dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ /* Could not send: requeue at the head to keep ordering, retry later. */
+ skb_queue_head(&npinfo->txq, skb);
+ __netif_tx_unlock(txq);
+ local_irq_restore(flags);
+
+ schedule_delayed_work(&npinfo->tx_work, HZ/10);
+ return;
+ }
+ __netif_tx_unlock(txq);
+ local_irq_restore(flags);
+ }
+}
+
+/*
+ * Check the UDP checksum of a received packet.
+ *
+ * Returns 0 when the packet carries no checksum (uh->check == 0), when
+ * skb_csum_unnecessary() says no check is needed, or when a
+ * CHECKSUM_COMPLETE sum folds to zero together with the pseudo-header.
+ * Otherwise the pseudo-header sum is stored in skb->csum and the result
+ * of __skb_checksum_complete() is returned.
+ */
+static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
+ unsigned short ulen, __be32 saddr, __be32 daddr)
+{
+ __wsum psum;
+
+ if (uh->check == 0 || skb_csum_unnecessary(skb))
+ return 0;
+
+ /* Pseudo-header checksum over addresses, length and protocol. */
+ psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE &&
+ !csum_fold(csum_add(psum, skb->csum)))
+ return 0;
+
+ skb->csum = psum;
+
+ return __skb_checksum_complete(skb);
+}
+
+/*
+ * Check whether delayed processing was scheduled for our NIC. If so,
+ * we attempt to grab the poll lock and use ->poll() to pump the card.
+ * If this fails, either we've recursed in ->poll() or it's already
+ * running on another CPU.
+ *
+ * Note: we don't mask interrupts with this lock because we're using
+ * trylock here and interrupts are already disabled in the softirq
+ * case. Further, we test the poll_owner to avoid recursion on UP
+ * systems where the lock doesn't exist.
+ *
+ * In cases where there is bi-directional communications, reading only
+ * one message at a time can lead to packets being dropped by the
+ * network adapter, forcing superfluous retries and possibly timeouts.
+ * Thus, we set our budget to greater than 1.
+ */
+/*
+ * Run one NAPI context's ->poll() on behalf of netpoll.
+ *
+ * Returns the budget left over: @budget unchanged if the NAPI context is
+ * not scheduled, otherwise @budget minus the work reported by ->poll().
+ */
+static int poll_one_napi(struct netpoll_info *npinfo,
+ struct napi_struct *napi, int budget)
+{
+ int work;
+
+ /* net_rx_action's ->poll() invocations and ours are
+ * synchronized by this test which is only made while
+ * holding the napi->poll_lock.
+ */
+ if (!test_bit(NAPI_STATE_SCHED, &napi->state))
+ return budget;
+
+ /* Set NETPOLL_RX_DROP, bump 'trapped' and flag NPSVC around the poll. */
+ npinfo->rx_flags |= NETPOLL_RX_DROP;
+ atomic_inc(&trapped);
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
+
+ work = napi->poll(napi, budget);
+
+ /* Undo the three markers in reverse order. */
+ clear_bit(NAPI_STATE_NPSVC, &napi->state);
+ atomic_dec(&trapped);
+ npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+
+ return budget - work;
+}
+
+/*
+ * Poll every NAPI context on @dev->napi_list that is not owned by the
+ * current CPU and whose poll_lock can be taken without blocking. A total
+ * budget of 16 is shared across the contexts; polling stops once it is
+ * used up.
+ */
+static void poll_napi(struct net_device *dev)
+{
+ struct napi_struct *napi;
+ int budget = 16;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ /* Skip contexts this CPU already owns, and ones that are locked. */
+ if (napi->poll_owner != smp_processor_id() &&
+ spin_trylock(&napi->poll_lock)) {
+ budget = poll_one_napi(dev->npinfo, napi, budget);
+ spin_unlock(&napi->poll_lock);
+
+ if (!budget)
+ break;
+ }
+ }
+}
+
+/*
+ * Answer every ARP request queued on @npi->arp_tx. A NULL @npi is
+ * accepted and ignored.
+ */
+static void service_arp_queue(struct netpoll_info *npi)
+{
+	struct sk_buff *request;
+
+	if (!npi)
+		return;
+
+	while ((request = skb_dequeue(&npi->arp_tx)) != NULL)
+		arp_reply(request);
+}
+
+/*
+ * netpoll_poll - service the device behind @np
+ *
+ * Does nothing unless the device exists, is running and provides a
+ * poll_controller. Otherwise it calls the device's poll_controller,
+ * polls its NAPI contexts, answers queued ARP requests and drains this
+ * CPU's completion queue, in that order.
+ */
+void netpoll_poll(struct netpoll *np)
+{
+	struct net_device *ndev = np->dev;
+
+	if (!ndev)
+		return;
+	if (!netif_running(ndev))
+		return;
+	if (!ndev->poll_controller)
+		return;
+
+	/* Process pending work on NIC */
+	ndev->poll_controller(ndev);
+	poll_napi(ndev);
+	service_arp_queue(ndev->npinfo);
+	zap_completion_queue();
+}
+
+/*
+ * Top skb_pool up to MAX_SKBS buffers of MAX_SKB_SIZE bytes each.
+ * Allocation uses GFP_ATOMIC and stops early at the first failure.
+ */
+static void refill_skbs(void)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ /* The pool lock is held, with interrupts off, for the whole refill. */
+ spin_lock_irqsave(&skb_pool.lock, flags);
+ while (skb_pool.qlen < MAX_SKBS) {
+ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
+ if (!skb)
+ break;
+
+ __skb_queue_tail(&skb_pool, skb);
+ }
+ spin_unlock_irqrestore(&skb_pool.lock, flags);
+}
+
+/*
+ * Drain the current CPU's softnet_data completion queue.
+ *
+ * The list is detached with local interrupts disabled and then walked
+ * with interrupts restored. skbs without a destructor are freed with
+ * __kfree_skb(); skbs with a destructor get their user count raised
+ * and are passed to dev_kfree_skb_any() instead.
+ */
+static void zap_completion_queue(void)
+{
+ unsigned long flags;
+ struct softnet_data *sd = &get_cpu_var(softnet_data);
+
+ if (sd->completion_queue) {
+ struct sk_buff *clist;
+
+ /* Take the whole list in one step. */
+ local_irq_save(flags);
+ clist = sd->completion_queue;
+ sd->completion_queue = NULL;
+ local_irq_restore(flags);
+
+ while (clist != NULL) {
+ struct sk_buff *skb = clist;
+ clist = clist->next;
+ if (skb->destructor) {
+ atomic_inc(&skb->users);
+ dev_kfree_skb_any(skb); /* put this one back */
+ } else {
+ __kfree_skb(skb);
+ }
+ }
+ }
+
+ /* Pairs with get_cpu_var() above. */
+ put_cpu_var(softnet_data);
+}
+
+/*
+ * Obtain an skb of @len bytes with @reserve bytes of headroom.
+ *
+ * The completion queue is drained and the reserve pool refilled first.
+ * Each attempt tries a GFP_ATOMIC allocation and then falls back to
+ * skb_pool. Between failed attempts the device is polled; after ten
+ * failed attempts NULL is returned. On success the skb's user count is
+ * set to 1.
+ */
+static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
+{
+	struct sk_buff *buf;
+	int attempts = 0;
+
+	zap_completion_queue();
+	refill_skbs();
+
+	for (;;) {
+		buf = alloc_skb(len, GFP_ATOMIC);
+		if (!buf)
+			buf = skb_dequeue(&skb_pool);
+		if (buf)
+			break;
+
+		if (++attempts >= 10)
+			return NULL;
+
+		/* Polling the device may release buffers; then try again. */
+		netpoll_poll(np);
+	}
+
+	atomic_set(&buf->users, 1);
+	skb_reserve(buf, reserve);
+	return buf;
+}
+
+/*
+ * Return 1 if any NAPI context on @dev->napi_list has the current CPU
+ * as its poll_owner, 0 otherwise.
+ */
+static int netpoll_owner_active(struct net_device *dev)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (napi->poll_owner == smp_processor_id())
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Transmit @skb on np->dev, either directly or via the deferred queue.
+ *
+ * The skb is freed if the device has no npinfo, is not running or is
+ * not present. A direct transmit is attempted only when npinfo->txq is
+ * empty and no NAPI context of the device is owned by this CPU; it is
+ * retried, polling the device in between, for roughly one jiffy. If the
+ * skb has still not been sent it is appended to npinfo->txq and tx_work
+ * is scheduled.
+ */
+static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+{
+ int status = NETDEV_TX_BUSY;
+ unsigned long tries;
+ struct net_device *dev = np->dev;
+ struct netpoll_info *npinfo = np->dev->npinfo;
+
+ if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* don't get messages out of order, and no recursion */
+ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
+ struct netdev_queue *txq;
+ unsigned long flags;
+
+ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+ local_irq_save(flags);
+ /* try until next clock tick */
+ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
+ tries > 0; --tries) {
+ /* Only a trylock: if the tx lock is busy, poll the device and retry. */
+ if (__netif_tx_trylock(txq)) {
+ if (!netif_tx_queue_stopped(txq))
+ status = dev->hard_start_xmit(skb, dev);
+ __netif_tx_unlock(txq);
+
+ if (status == NETDEV_TX_OK)
+ break;
+
+ }
+
+ /* tickle device maybe there is some cleanup */
+ netpoll_poll(np);
+
+ udelay(USEC_PER_POLL);
+ }
+ local_irq_restore(flags);
+ }
+
+ /* Not sent (or not attempted): hand it to queue_process(). */
+ if (status != NETDEV_TX_OK) {
+ skb_queue_tail(&npinfo->txq, skb);
+ schedule_delayed_work(&npinfo->tx_work,0);
+ }
+}
+
+/*
+ * netpoll_send_udp - send @len bytes of @msg as one UDP datagram
+ * @np: netpoll instance supplying ports, addresses, MAC and device
+ * @msg: payload to send
+ * @len: payload length in bytes
+ *
+ * Builds an Ethernet/IPv4/UDP frame by copying the payload into an skb
+ * whose headroom covers all headers, then pushing the UDP, IP and
+ * Ethernet headers in front of it. The frame is handed to
+ * netpoll_send_skb(). Returns silently if no skb can be obtained.
+ */
+void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+{
+ int total_len, eth_len, ip_len, udp_len;
+ struct sk_buff *skb;
+ struct udphdr *udph;
+ struct iphdr *iph;
+ struct ethhdr *eth;
+
+ udp_len = len + sizeof(*udph);
+ ip_len = eth_len = udp_len + sizeof(*iph);
+ total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
+
+ /* Reserve everything except the payload as headroom. */
+ skb = find_skb(np, total_len, total_len - len);
+ if (!skb)
+ return;
+
+ skb_copy_to_linear_data(skb, msg, len);
+ skb->len += len;
+
+ /* UDP header */
+ skb_push(skb, sizeof(*udph));
+ skb_reset_transport_header(skb);
+ udph = udp_hdr(skb);
+ udph->source = htons(np->local_port);
+ udph->dest = htons(np->remote_port);
+ udph->len = htons(udp_len);
+ udph->check = 0;
+ udph->check = csum_tcpudp_magic(htonl(np->local_ip),
+ htonl(np->remote_ip),
+ udp_len, IPPROTO_UDP,
+ csum_partial((unsigned char *)udph, udp_len, 0));
+ /* A computed checksum of 0 is sent as CSUM_MANGLED_0 instead. */
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ /* IPv4 header */
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ /* iph->version = 4; iph->ihl = 5; */
+ put_unaligned(0x45, (unsigned char *)iph);
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &(iph->tot_len));
+ iph->id = 0;
+ iph->frag_off = 0;
+ iph->ttl = 64;
+ iph->protocol = IPPROTO_UDP;
+ iph->check = 0;
+ put_unaligned(htonl(np->local_ip), &(iph->saddr));
+ put_unaligned(htonl(np->remote_ip), &(iph->daddr));
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ /* Ethernet header */
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IP);
+ memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN);
+ memcpy(eth->h_dest, np->remote_mac, ETH_ALEN);
+
+ skb->dev = np->dev;
+
+ netpoll_send_skb(np, skb);
+}
+
+/*
+ * Answer a queued ARP request on behalf of the netpoll client that
+ * receives on skb->dev.
+ *
+ * A reply is only sent when npinfo->rx_np is bound to this device, the
+ * device does ARP, the request is an Ethernet/IEEE802 IPv4 ARP request,
+ * and its target IP equals np->local_ip (and is neither loopback nor
+ * multicast). The reply is sent with netpoll_send_skb().
+ *
+ * NOTE(review): no path in this function frees @skb, and
+ * service_arp_queue() does not free it either -- confirm who releases
+ * the request skb.
+ */
+static void arp_reply(struct sk_buff *skb)
+{
+ struct netpoll_info *npinfo = skb->dev->npinfo;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
+ __be32 sip, tip;
+ unsigned char *sha;
+ struct sk_buff *send_skb;
+ struct netpoll *np = NULL;
+
+ if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+ np = npinfo->rx_np;
+ if (!np)
+ return;
+
+ /* No arp on this interface */
+ if (skb->dev->flags & IFF_NOARP)
+ return;
+
+ if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
+ return;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ arp = arp_hdr(skb);
+
+ if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_op != htons(ARPOP_REQUEST))
+ return;
+
+ /* The ARP payload is: sender hw, sender IP, target hw, target IP. */
+ arp_ptr = (unsigned char *)(arp+1);
+ /* save the location of the src hw addr */
+ sha = arp_ptr;
+ arp_ptr += skb->dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ /* if we actually cared about dst hw addr, it would get copied here */
+ arp_ptr += skb->dev->addr_len;
+ memcpy(&tip, arp_ptr, 4);
+
+ /* Should we ignore arp? */
+ if (tip != htonl(np->local_ip) ||
+ ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+ return;
+
+ size = arp_hdr_len(skb->dev);
+ send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
+ LL_RESERVED_SPACE(np->dev));
+
+ if (!send_skb)
+ return;
+
+ skb_reset_network_header(send_skb);
+ arp = (struct arphdr *) skb_put(send_skb, size);
+ send_skb->dev = skb->dev;
+ send_skb->protocol = htons(ETH_P_ARP);
+
+ /* Fill the device header for the ARP frame */
+ if (dev_hard_header(send_skb, skb->dev, ptype,
+ sha, np->dev->dev_addr,
+ send_skb->len) < 0) {
+ kfree_skb(send_skb);
+ return;
+ }
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * we only support ethernet device type,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+
+ arp->ar_hrd = htons(np->dev->type);
+ arp->ar_pro = htons(ETH_P_IP);
+ arp->ar_hln = np->dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ /* Reply payload: our hw addr, the requested IP, then the requester's hw addr and IP. */
+ arp_ptr=(unsigned char *)(arp + 1);
+ memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
+ arp_ptr += np->dev->addr_len;
+ memcpy(arp_ptr, &tip, 4);
+ arp_ptr += 4;
+ memcpy(arp_ptr, sha, np->dev->addr_len);
+ arp_ptr += np->dev->addr_len;
+ memcpy(arp_ptr, &sip, 4);
+
+ netpoll_send_skb(np, send_skb);
+}
+
+/*
+ * __netpoll_rx - let the netpoll receive client inspect an incoming skb
+ * @skb: received packet
+ *
+ * ARP packets are queued on npinfo->arp_tx while netpoll has the device
+ * trapped. IPv4/UDP packets that pass the header, length and checksum
+ * checks and match the client's configured addresses and local port are
+ * handed to np->rx_hook() and then freed.
+ *
+ * Returns 1 when the skb was consumed here (queued, delivered, or
+ * dropped because the device is trapped) and 0 when the caller should
+ * continue with normal processing.
+ */
+int __netpoll_rx(struct sk_buff *skb)
+{
+	int proto, len, ulen;
+	struct iphdr *iph;
+	struct udphdr *uh;
+	struct netpoll_info *npi = skb->dev->npinfo;
+	struct netpoll *np = npi->rx_np;
+
+	if (!np)
+		goto out;
+	if (skb->dev->type != ARPHRD_ETHER)
+		goto out;
+
+	/* check if netpoll clients need ARP */
+	if (skb->protocol == htons(ETH_P_ARP) &&
+	    atomic_read(&trapped)) {
+		skb_queue_tail(&npi->arp_tx, skb);
+		return 1;
+	}
+
+	proto = ntohs(eth_hdr(skb)->h_proto);
+	if (proto != ETH_P_IP)
+		goto out;
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto out;
+	if (skb_shared(skb))
+		goto out;
+
+	/*
+	 * pskb_may_pull() and pskb_trim_rcsum() may reallocate the skb
+	 * header, so iph has to be (re)loaded from skb->data after each
+	 * of them rather than once up front.
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+	iph = (struct iphdr *)skb->data;
+	if (iph->ihl < 5 || iph->version != 4)
+		goto out;
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto out;
+	iph = (struct iphdr *)skb->data;
+	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+		goto out;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < iph->ihl*4)
+		goto out;
+
+	/*
+	 * Our transport medium may have padded the buffer out.
+	 * Now We trim to the true length of the frame.
+	 */
+	if (pskb_trim_rcsum(skb, len))
+		goto out;
+
+	iph = (struct iphdr *)skb->data;
+	if (iph->protocol != IPPROTO_UDP)
+		goto out;
+
+	len -= iph->ihl*4;
+	uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
+	ulen = ntohs(uh->len);
+
+	if (ulen != len)
+		goto out;
+	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
+		goto out;
+	/* A zero local_ip, remote_ip or local_port in @np matches anything. */
+	if (np->local_ip && np->local_ip != ntohl(iph->daddr))
+		goto out;
+	if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
+		goto out;
+	if (np->local_port && np->local_port != ntohs(uh->dest))
+		goto out;
+
+	np->rx_hook(np, ntohs(uh->source),
+		    (char *)(uh+1),
+		    ulen - sizeof(struct udphdr));
+
+	kfree_skb(skb);
+	return 1;
+
+out:
+	/* While trapped, netpoll swallows every packet it does not handle. */
+	if (atomic_read(&trapped)) {
+		kfree_skb(skb);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * netpoll_print_options - log the configuration held in @np
+ *
+ * Prints local port, local IP, interface name, remote port, remote IP
+ * and remote MAC address at KERN_INFO, each prefixed with np->name.
+ */
+void netpoll_print_options(struct netpoll *np)
+{
+ DECLARE_MAC_BUF(mac);
+ printk(KERN_INFO "%s: local port %d\n",
+ np->name, np->local_port);
+ printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
+ np->name, HIPQUAD(np->local_ip));
+ printk(KERN_INFO "%s: interface %s\n",
+ np->name, np->dev_name);
+ printk(KERN_INFO "%s: remote port %d\n",
+ np->name, np->remote_port);
+ printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n",
+ np->name, HIPQUAD(np->remote_ip));
+ printk(KERN_INFO "%s: remote ethernet address %s\n",
+ np->name, print_mac(mac, np->remote_mac));
+}
+
+/*
+ * netpoll_parse_options - parse a netpoll configuration string into @np
+ * @np: netpoll instance to fill in
+ * @opt: option string; it is modified in place (delimiters are
+ *       overwritten with NUL bytes)
+ *
+ * The layout consumed here is
+ *   [local-port]@[local-ip]/[dev],[remote-port]@<remote-ip>/[remote-mac]
+ * where the MAC address, if present, is six ':'-separated hex bytes.
+ * Fields left empty keep whatever value @np already holds.
+ *
+ * Returns 0 on success (after printing the resulting options) and -1 if
+ * an expected delimiter is missing.
+ */
+int netpoll_parse_options(struct netpoll *np, char *opt)
+{
+	char *cur = opt;
+	char *delim;
+	int i;
+
+	/* local port */
+	if (*cur != '@') {
+		delim = strchr(cur, '@');
+		if (!delim)
+			goto parse_failed;
+		*delim = 0;
+		np->local_port = simple_strtol(cur, NULL, 10);
+		cur = delim;
+	}
+	cur++;
+
+	/* local ip */
+	if (*cur != '/') {
+		delim = strchr(cur, '/');
+		if (!delim)
+			goto parse_failed;
+		*delim = 0;
+		np->local_ip = ntohl(in_aton(cur));
+		cur = delim;
+	}
+	cur++;
+
+	/* parse out dev name */
+	if (*cur != ',') {
+		delim = strchr(cur, ',');
+		if (!delim)
+			goto parse_failed;
+		*delim = 0;
+		strlcpy(np->dev_name, cur, sizeof(np->dev_name));
+		cur = delim;
+	}
+	cur++;
+
+	/* dst port */
+	if (*cur != '@') {
+		delim = strchr(cur, '@');
+		if (!delim)
+			goto parse_failed;
+		*delim = 0;
+		np->remote_port = simple_strtol(cur, NULL, 10);
+		cur = delim;
+	}
+	cur++;
+
+	/* dst ip */
+	delim = strchr(cur, '/');
+	if (!delim)
+		goto parse_failed;
+	*delim = 0;
+	np->remote_ip = ntohl(in_aton(cur));
+	cur = delim + 1;
+
+	/* MAC address: five ':'-terminated bytes, then the final byte */
+	if (*cur != 0) {
+		for (i = 0; i < 5; i++) {
+			delim = strchr(cur, ':');
+			if (!delim)
+				goto parse_failed;
+			*delim = 0;
+			np->remote_mac[i] = simple_strtol(cur, NULL, 16);
+			cur = delim + 1;
+		}
+		np->remote_mac[5] = simple_strtol(cur, NULL, 16);
+	}
+
+	netpoll_print_options(np);
+
+	return 0;
+
+ parse_failed:
+	printk(KERN_INFO "%s: couldn't parse config at %s!\n",
+	       np->name, cur);
+	return -1;
+}
+
+/*
+ * netpoll_setup - bind a netpoll instance to the device np->dev_name
+ * @np: netpoll instance; dev_name must name a device in init_net
+ *
+ * Takes a reference on the device, creates its netpoll_info (or takes a
+ * reference on the existing one), brings the device up if necessary and
+ * waits for carrier, picks the device's first IPv4 address when
+ * np->local_ip is unset, installs @np as the receive client if it has
+ * an rx_hook, and finally publishes the netpoll_info on the device.
+ *
+ * Returns 0 on success. On failure a negative errno is returned, the
+ * device reference is dropped, np->dev is reset to NULL, and the
+ * netpoll_info is either freed (if created here) or has the reference
+ * taken here dropped again.
+ */
+int netpoll_setup(struct netpoll *np)
+{
+	struct net_device *ndev = NULL;
+	struct in_device *in_dev;
+	struct netpoll_info *npinfo;
+	unsigned long flags;
+	int err;
+
+	if (np->dev_name)
+		ndev = dev_get_by_name(&init_net, np->dev_name);
+	if (!ndev) {
+		printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
+		       np->name, np->dev_name);
+		return -ENODEV;
+	}
+
+	np->dev = ndev;
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		if (!npinfo) {
+			err = -ENOMEM;
+			goto release;
+		}
+
+		npinfo->rx_flags = 0;
+		npinfo->rx_np = NULL;
+
+		spin_lock_init(&npinfo->rx_lock);
+		skb_queue_head_init(&npinfo->arp_tx);
+		skb_queue_head_init(&npinfo->txq);
+		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+		atomic_set(&npinfo->refcnt, 1);
+	} else {
+		npinfo = ndev->npinfo;
+		atomic_inc(&npinfo->refcnt);
+	}
+
+	if (!ndev->poll_controller) {
+		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
+		       np->name, np->dev_name);
+		err = -ENOTSUPP;
+		goto release;
+	}
+
+	if (!netif_running(ndev)) {
+		unsigned long atmost, atleast;
+
+		printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
+		       np->name, np->dev_name);
+
+		rtnl_lock();
+		err = dev_open(ndev);
+		rtnl_unlock();
+
+		if (err) {
+			printk(KERN_ERR "%s: failed to open %s\n",
+			       np->name, ndev->name);
+			goto release;
+		}
+
+		/* Wait up to 4 seconds for the link to report carrier. */
+		atleast = jiffies + HZ/10;
+		atmost = jiffies + 4*HZ;
+		while (!netif_carrier_ok(ndev)) {
+			if (time_after(jiffies, atmost)) {
+				printk(KERN_NOTICE
+				       "%s: timeout waiting for carrier\n",
+				       np->name);
+				break;
+			}
+			cond_resched();
+		}
+
+		/* If carrier appears to come up instantly, we don't
+		 * trust it and pause so that we don't pump all our
+		 * queued console messages into the bitbucket.
+		 */
+
+		if (time_before(jiffies, atleast)) {
+			printk(KERN_NOTICE "%s: carrier detect appears"
+			       " untrustworthy, waiting 4 seconds\n",
+			       np->name);
+			msleep(4000);
+		}
+	}
+
+	/* No local address configured: use the device's first IPv4 address. */
+	if (!np->local_ip) {
+		rcu_read_lock();
+		in_dev = __in_dev_get_rcu(ndev);
+
+		if (!in_dev || !in_dev->ifa_list) {
+			rcu_read_unlock();
+			printk(KERN_ERR "%s: no IP address for %s, aborting\n",
+			       np->name, np->dev_name);
+			err = -EDESTADDRREQ;
+			goto release;
+		}
+
+		np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
+		rcu_read_unlock();
+		printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
+		       np->name, HIPQUAD(np->local_ip));
+	}
+
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		npinfo->rx_np = np;
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	/* fill up the skb queue */
+	refill_skbs();
+
+	/* last thing to do is link it to the net device structure */
+	ndev->npinfo = npinfo;
+
+	/* avoid racing with NAPI reading npinfo */
+	synchronize_rcu();
+
+	return 0;
+
+ release:
+	/*
+	 * Undo the npinfo step: free the structure if it was allocated
+	 * here (it has not been published on the device yet), otherwise
+	 * give back the reference taken on the pre-existing one so that
+	 * a later netpoll_cleanup() can still drop the count to zero.
+	 */
+	if (!ndev->npinfo)
+		kfree(npinfo);
+	else
+		atomic_dec(&npinfo->refcnt);
+	np->dev = NULL;
+	dev_put(ndev);
+	return err;
+}
+
+/*
+ * Initialise the shared skb_pool queue head. Registered with
+ * core_initcall(); always returns 0.
+ */
+static int __init netpoll_init(void)
+{
+ skb_queue_head_init(&skb_pool);
+ return 0;
+}
+core_initcall(netpoll_init);
+
+/*
+ * netpoll_cleanup - detach @np from its device
+ *
+ * If @np is the device's receive client, it is removed under rx_lock.
+ * The netpoll_info reference is dropped; when it was the last one the
+ * queues are purged, pending tx work is cancelled and the structure is
+ * freed. Finally the device reference is released and np->dev is reset
+ * to NULL. Does nothing but that reset when np->dev is already NULL.
+ *
+ * NOTE(review): npinfo is kfree()d before np->dev->npinfo is cleared --
+ * confirm that no reader can pick up the stale pointer in between.
+ */
+void netpoll_cleanup(struct netpoll *np)
+{
+ struct netpoll_info *npinfo;
+ unsigned long flags;
+
+ if (np->dev) {
+ npinfo = np->dev->npinfo;
+ if (npinfo) {
+ if (npinfo->rx_np == np) {
+ spin_lock_irqsave(&npinfo->rx_lock, flags);
+ npinfo->rx_np = NULL;
+ npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+ spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ }
+
+ /* Last user: tear the netpoll_info down. */
+ if (atomic_dec_and_test(&npinfo->refcnt)) {
+ skb_queue_purge(&npinfo->arp_tx);
+ skb_queue_purge(&npinfo->txq);
+ cancel_rearming_delayed_work(&npinfo->tx_work);
+
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ kfree(npinfo);
+ np->dev->npinfo = NULL;
+ }
+ }
+
+ dev_put(np->dev);
+ }
+
+ np->dev = NULL;
+}
+
+/*
+ * netpoll_trap - return the current value of the 'trapped' counter
+ * (non-zero while at least one trap is outstanding).
+ */
+int netpoll_trap(void)
+{
+ return atomic_read(&trapped);
+}
+
+/*
+ * netpoll_set_trap - adjust the 'trapped' counter
+ * @trap: non-zero increments the counter, zero decrements it
+ */
+void netpoll_set_trap(int trap)
+{
+	if (!trap) {
+		atomic_dec(&trapped);
+		return;
+	}
+
+	atomic_inc(&trapped);
+}
+
+EXPORT_SYMBOL(netpoll_set_trap);
+EXPORT_SYMBOL(netpoll_trap);
+EXPORT_SYMBOL(netpoll_print_options);
+EXPORT_SYMBOL(netpoll_parse_options);
+EXPORT_SYMBOL(netpoll_setup);
+EXPORT_SYMBOL(netpoll_cleanup);
+EXPORT_SYMBOL(netpoll_send_udp);
+EXPORT_SYMBOL(netpoll_poll);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
new file mode 100644
index 0000000..8997e91
--- /dev/null
+++ b/net/core/pktgen.c
@@ -0,0 +1,3859 @@
+/*
+ * Authors:
+ * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
+ * Uppsala University and
+ * Swedish University of Agricultural Sciences
+ *
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Ben Greear <greearb@candelatech.com>
+ * Jens Låås <jens.laas@data.slu.se>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * A tool for loading the network with preconfigured packets.
+ * The tool is implemented as a linux module. Parameters are output
+ * device, delay (to hard_xmit), number of packets, and whether
+ * to use multiple SKBs or just the same one.
+ * pktgen uses the installed interface's output routine.
+ *
+ * Additional hacking by:
+ *
+ * Jens.Laas@data.slu.se
+ * Improved by ANK. 010120.
+ * Improved by ANK even more. 010212.
+ * MAC address typo fixed. 010417 --ro
+ * Integrated. 020301 --DaveM
+ * Added multiskb option 020301 --DaveM
+ * Scaling of results. 020417--sigurdur@linpro.no
+ * Significant re-work of the module:
+ * * Convert to threaded model to more efficiently be able to transmit
+ * and receive on multiple interfaces at once.
+ * * Converted many counters to __u64 to allow longer runs.
+ * * Allow configuration of ranges, like min/max IP address, MACs,
+ * and UDP-ports, for both source and destination, and can
+ * set to use a random distribution or sequentially walk the range.
+ * * Can now change most values after starting.
+ * * Place 12-byte packet in UDP payload with magic number,
+ * sequence number, and timestamp.
+ * * Add receiver code that detects dropped pkts, re-ordered pkts, and
+ * latencies (with micro-second precision).
+ * * Add IOCTL interface to easily get counters & configuration.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
+ * as a "fastpath" with a configurable number of clones after alloc's.
+ * clone_skb=0 means all packets are allocated this also means ranges time
+ * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clones.
+ *
+ * Also moved to /proc/net/pktgen/
+ * --ro
+ *
+ * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever
+ * mistakes. Also merged in DaveM's patch in the -pre6 patch.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
+ *
+ *
+ * 021124 Finished major redesign and rewrite for new functionality.
+ * See Documentation/networking/pktgen.txt for how to use this.
+ *
+ * The new operation:
+ * For each CPU one thread/process is created at start. This thread checks
+ * for running devices in the if_list and sends packets until count is 0.
+ * The thread also checks thread->control, which is used for inter-process
+ * communication: the controlling process "posts" operations to the threads
+ * this way. It should be possible to remove the if_lock when add/rem_device
+ * is merged into this too.
+ *
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple concurrent writers give unpredictable results: each "write"
+ * to /proc produces a result code that should be read back by the "writer".
+ * For practical use this should be no problem.
+ *
+ * Note: when adding devices to a specific CPU it is a good idea to also assign
+ * /proc/irq/XX/smp_affinity so that TX interrupts get bound to the same CPU.
+ * --ro
+ *
+ * Fix refcount off by one if first packet fails, potential null deref,
+ * memleak 030710- KJP
+ *
+ * First "ranges" functionality for ipv6 030726 --ro
+ *
+ * Included flow support. 030802 ANK.
+ *
+ * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
+ *
+ * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
+ * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
+ *
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * <shemminger@osdl.org> 040923
+ *
+ * Randy Dunlap fixed u64 printk compiler warning
+ *
+ * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
+ * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
+ *
+ * Corrections from Nikolai Malykh (nmalykh@bilim.com)
+ * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
+ *
+ * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
+ * 050103
+ *
+ * MPLS support by Steven Whitehouse <steve@chygwyn.com>
+ *
+ * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
+ *
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
+ *
+ */
+#include <linux/sys.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/unistd.h>
+#include <linux/string.h>
+#include <linux/ptrace.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/wait.h>
+#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <net/net_namespace.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#ifdef CONFIG_XFRM
+#include <net/xfrm.h>
+#endif
+#include <asm/byteorder.h>
+#include <linux/rcupdate.h>
+#include <linux/bitops.h>
+#include <asm/io.h>
+#include <asm/dma.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h> /* do_div */
+#include <asm/timex.h>
+
+#define VERSION "pktgen v2.70: Packet Generator for packet performance testing.\n"
+
+#define IP_NAME_SZ 32
+#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
+#define MPLS_STACK_BOTTOM htonl(0x00000100)
+
+/* Device flag bits */
+#define F_IPSRC_RND (1<<0) /* IP-Src Random */
+#define F_IPDST_RND (1<<1) /* IP-Dst Random */
+#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */
+#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */
+#define F_MACSRC_RND (1<<4) /* MAC-Src Random */
+#define F_MACDST_RND (1<<5) /* MAC-Dst Random */
+#define F_TXSIZE_RND (1<<6) /* Transmit size is random */
+#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */
+#define F_MPLS_RND (1<<8) /* Random MPLS labels */
+#define F_VID_RND (1<<9) /* Random VLAN ID */
+#define F_SVID_RND (1<<10) /* Random SVLAN ID */
+#define F_FLOW_SEQ (1<<11) /* Sequential flows */
+#define F_IPSEC_ON (1<<12) /* ipsec on for flows */
+#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */
+#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
+
+/* Thread control flag bits */
+#define T_TERMINATE (1<<0)
+#define T_STOP (1<<1) /* Stop run */
+#define T_RUN (1<<2) /* Start run */
+#define T_REMDEVALL (1<<3) /* Remove all devs */
+#define T_REMDEV (1<<4) /* Remove one dev */
+
+/*
+ * If lock -- can be removed after some work.
+ *
+ * The expansions are plain expressions: the argument is parenthesised and
+ * there is no trailing semicolon, so "if_lock(t);" is a single statement
+ * and is safe inside an unbraced if/else.
+ */
+#define if_lock(t) spin_lock(&((t)->if_lock))
+#define if_unlock(t) spin_unlock(&((t)->if_lock))
+
+/* Used to help with determining the pkts on receive */
+#define PKTGEN_MAGIC 0xbe9be955
+#define PG_PROC_DIR "pktgen"
+#define PGCTRL "pgctrl"
+static struct proc_dir_entry *pg_proc_dir = NULL;
+
+#define MAX_CFLOWS 65536
+
+#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
+#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+
+/*
+ * State kept for one flow; struct pktgen_dev refers to these records
+ * through its "flows" pointer.
+ */
+struct flow_state {
+ __be32 cur_daddr;
+ int count;
+#ifdef CONFIG_XFRM
+ struct xfrm_state *x; /* only present when built with CONFIG_XFRM */
+#endif
+ __u32 flags; /* flow flag bits, see F_INIT */
+};
+
+/* flow flag bits */
+#define F_INIT (1<<0) /* flow has been initialized */
+
+/*
+ * Configuration, run-time state and statistics for one pktgen output
+ * device.  The fields are grouped as: bookkeeping, packet size/count/timing
+ * parameters, run-time counters, address/port ranges, MPLS and VLAN
+ * settings, MAC settings, "cur_*" values of the packet being built, the
+ * cached link-layer header, flow state and finally the result string shown
+ * to the user.
+ */
+struct pktgen_dev {
+ /*
+ * Try to keep frequent/infrequent used vars. separated.
+ */
+ struct proc_dir_entry *entry; /* proc file */
+ struct pktgen_thread *pg_thread;/* the owner */
+ struct list_head list; /* Used for chaining in the thread's run-queue */
+
+ int running; /* if this changes to false, the test will stop */
+
+ /* If min != max, then we will either do a linear iteration, or
+ * we will do a random selection from within the range.
+ */
+ __u32 flags;
+ int removal_mark; /* non-zero => the device is marked for
+ * removal by worker thread */
+
+ int min_pkt_size; /* = ETH_ZLEN; */
+ int max_pkt_size; /* = ETH_ZLEN; */
+ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
+ int nfrags;
+ __u32 delay_us; /* Default delay */
+ __u32 delay_ns;
+ __u64 count; /* Default No packets to send */
+ __u64 sofar; /* How many pkts we've sent so far */
+ __u64 tx_bytes; /* How many bytes we've transmitted */
+ __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */
+
+ /* runtime counters relating to clone_skb */
+ __u64 next_tx_us; /* timestamp of when to tx next */
+ __u32 next_tx_ns;
+
+ __u64 allocated_skbs;
+ __u32 clone_count;
+ int last_ok; /* Was last skb sent?
+ * Or a failed transmit of some sort? This will keep
+ * sequence numbers in order, for example.
+ */
+ __u64 started_at; /* micro-seconds */
+ __u64 stopped_at; /* micro-seconds */
+ __u64 idle_acc; /* micro-seconds */
+ __u32 seq_num;
+
+ int clone_skb; /* Use multiple SKBs during packet gen. If this number
+ * is greater than 1, then that many copies of the same
+ * packet will be sent before a new packet is allocated.
+ * For instance, if you want to send 1024 identical packets
+ * before creating a new packet, set clone_skb to 1024.
+ */
+
+ char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+
+ struct in6_addr in6_saddr;
+ struct in6_addr in6_daddr;
+ struct in6_addr cur_in6_daddr;
+ struct in6_addr cur_in6_saddr;
+ /* For ranges */
+ struct in6_addr min_in6_daddr;
+ struct in6_addr max_in6_daddr;
+ struct in6_addr min_in6_saddr;
+ struct in6_addr max_in6_saddr;
+
+ /* If we're doing ranges, random or incremental, then this
+ * defines the min/max for those ranges.
+ */
+ __be32 saddr_min; /* inclusive, source IP address */
+ __be32 saddr_max; /* exclusive, source IP address */
+ __be32 daddr_min; /* inclusive, dest IP address */
+ __be32 daddr_max; /* exclusive, dest IP address */
+
+ __u16 udp_src_min; /* inclusive, source UDP port */
+ __u16 udp_src_max; /* exclusive, source UDP port */
+ __u16 udp_dst_min; /* inclusive, dest UDP port */
+ __u16 udp_dst_max; /* exclusive, dest UDP port */
+
+ /* DSCP + ECN */
+ __u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */
+
+ /* MPLS */
+ unsigned nr_labels; /* Depth of stack, 0 = no MPLS */
+ __be32 labels[MAX_MPLS_LABELS];
+
+ /* VLAN/SVLAN (802.1Q/Q-in-Q) */
+ __u8 vlan_p;
+ __u8 vlan_cfi;
+ __u16 vlan_id; /* 0xffff means no vlan tag */
+
+ __u8 svlan_p;
+ __u8 svlan_cfi;
+ __u16 svlan_id; /* 0xffff means no svlan tag */
+
+ __u32 src_mac_count; /* How many MACs to iterate through */
+ __u32 dst_mac_count; /* How many MACs to iterate through */
+
+ unsigned char dst_mac[ETH_ALEN];
+ unsigned char src_mac[ETH_ALEN];
+
+ __u32 cur_dst_mac_offset;
+ __u32 cur_src_mac_offset;
+ __be32 cur_saddr;
+ __be32 cur_daddr;
+ __u16 cur_udp_dst;
+ __u16 cur_udp_src;
+ __u16 cur_queue_map;
+ __u32 cur_pkt_size;
+
+ __u8 hh[14];
+ /* = {
+ 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+
+ We fill in SRC address later
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x08, 0x00
+ };
+ */
+ __u16 pad; /* pad out the hh struct to an even 16 bytes */
+
+ struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we
+ * are transmitting the same one multiple times
+ */
+ struct net_device *odev; /* The out-going device. Note that the device should
+ * have its pg_info pointer pointing back to this
+ * device. This will be set when the user specifies
+ * the out-going device name (not when the inject is
+ * started as it used to do.)
+ */
+ struct flow_state *flows;
+ unsigned cflows; /* Concurrent flows (config) */
+ unsigned lflow; /* Flow length (config) */
+ unsigned nflows; /* accumulated flows (stats) */
+ unsigned curfl; /* current sequenced flow (state)*/
+
+ u16 queue_map_min;
+ u16 queue_map_max;
+
+#ifdef CONFIG_XFRM
+ __u8 ipsmode; /* IPSEC mode (config) */
+ __u8 ipsproto; /* IPSEC type (config) */
+#endif
+ char result[512]; /* status text of the last command; shown as "Result:" in the proc file */
+};
+
+/*
+ * pktgen packet header: magic number, sequence number and a timestamp
+ * split into seconds and micro-seconds.  All four fields are 32-bit
+ * big-endian values.
+ * NOTE(review): the file header speaks of a 12-byte payload header, while
+ * this layout has four 32-bit fields -- confirm which is current.
+ */
+struct pktgen_hdr {
+ __be32 pgh_magic;
+ __be32 seq_num;
+ __be32 tv_sec;
+ __be32 tv_usec;
+};
+
+/*
+ * State of one pktgen worker thread (the file header describes one such
+ * thread being created per CPU).
+ */
+struct pktgen_thread {
+ spinlock_t if_lock; /* taken through the if_lock()/if_unlock() macros */
+ struct list_head if_list; /* All devices handled by this thread */
+ struct list_head th_list; /* presumably links into pktgen_threads -- TODO confirm */
+ struct task_struct *tsk;
+ char result[512];
+
+ /* Field for thread to receive "posted" events terminate, stop ifs etc. */
+
+ u32 control; /* T_* thread control flag bits */
+ int cpu;
+
+ wait_queue_head_t queue;
+ struct completion start_done;
+};
+
+#define REMOVE 1
+#define FIND 0
+
+/** Convert to micro-seconds */
+/* Express a struct timeval as a single count of micro-seconds. */
+static inline __u64 tv_to_us(const struct timeval *tv)
+{
+	__u64 whole_secs_us = (__u64) tv->tv_sec * (__u64) 1000000;
+
+	return (__u64) tv->tv_usec + whole_secs_us;
+}
+
+/* Current time of day, as returned by do_gettimeofday(), in micro-seconds. */
+static __u64 getCurUs(void)
+{
+	struct timeval now;
+	__u64 us;
+
+	do_gettimeofday(&now);
+
+	us = now.tv_usec;
+	us += (__u64) now.tv_sec * (__u64) 1000000;
+	return us;
+}
+
+/* old include end */
+
+static char version[] __initdata = VERSION;
+
+static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname);
+static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
+static void pktgen_run_all_threads(void);
+static void pktgen_stop_all_threads_ifs(void);
+static int pktgen_stop_device(struct pktgen_dev *pkt_dev);
+static void pktgen_stop(struct pktgen_thread *t);
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+
+static unsigned int scan_ip6(const char *s, char ip[16]);
+static unsigned int fmt_ip6(char *s, const char ip[16]);
+
+/* Module parameters, defaults. */
+static int pg_count_d = 1000; /* 1000 pkts by default */
+static int pg_delay_d;
+static int pg_clone_skb_d;
+static int debug;
+
+static DEFINE_MUTEX(pktgen_thread_lock);
+static LIST_HEAD(pktgen_threads);
+
+static struct notifier_block pktgen_notifier_block = {
+ .notifier_call = pktgen_device_event,
+};
+
+/*
+ * /proc handling functions
+ *
+ */
+
+/* seq_file show routine used by pgctrl_open(): prints the VERSION banner. */
+static int pgctrl_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, VERSION);
+ return 0;
+}
+
+/*
+ * pgctrl_write - handle a command written to the control file
+ * @file:  unused
+ * @buf:   user buffer holding the command
+ * @count: number of bytes in @buf
+ * @ppos:  unused
+ *
+ * Recognised commands are "stop" and "start"; anything else is logged as
+ * unknown.  The last byte that was copied is replaced by the string
+ * terminator, so the command is expected to be followed by one extra
+ * character (normally the newline added by echo).  At most sizeof(data)
+ * bytes are looked at.
+ *
+ * Returns the number of bytes consumed, -EPERM without CAP_NET_ADMIN,
+ * -EINVAL for an empty write or -EFAULT if @buf cannot be read.
+ */
+static ssize_t pgctrl_write(struct file *file, const char __user * buf,
+			    size_t count, loff_t * ppos)
+{
+	char data[128];
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* an empty write would make data[count - 1] index before the array */
+	if (count == 0)
+		return -EINVAL;
+
+	if (count > sizeof(data))
+		count = sizeof(data);
+
+	if (copy_from_user(data, buf, count))
+		return -EFAULT;
+
+	data[count - 1] = 0;	/* Make string */
+
+	if (!strcmp(data, "stop"))
+		pktgen_stop_all_threads_ifs();
+
+	else if (!strcmp(data, "start"))
+		pktgen_run_all_threads();
+
+	else
+		printk(KERN_WARNING "pktgen: Unknown command: %s\n", data);
+
+	return count;
+}
+
+/*
+ * Open routine: sets the file up as a single-record seq_file rendered by
+ * pgctrl_show(), passing the proc entry's data pointer as private data.
+ */
+static int pgctrl_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pgctrl_show, PDE(inode)->data);
+}
+
+/*
+ * File operations built from the pgctrl_* handlers: reads go through the
+ * seq_file helpers, writes through pgctrl_write().
+ * Presumably installed on the PGCTRL proc entry -- the registration is not
+ * in this part of the file.
+ */
+static const struct file_operations pktgen_fops = {
+ .owner = THIS_MODULE,
+ .open = pgctrl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pgctrl_write,
+ .release = single_release, /* pairs with single_open() in pgctrl_open() */
+};
+
+/*
+ * pktgen_if_show - seq_file show routine for one pktgen device
+ * @seq: seq_file whose ->private is the struct pktgen_dev to print
+ * @v:   unused
+ *
+ * Prints, in order: the configured parameters ("Params:"), the names of the
+ * flags that are set ("Flags:"), the run-time counters and current values
+ * ("Current:") and the stored result string ("Result:").  Always returns 0.
+ */
+static int pktgen_if_show(struct seq_file *seq, void *v)
+{
+ struct pktgen_dev *pkt_dev = seq->private;
+ __u64 sa;
+ __u64 stopped;
+ __u64 now = getCurUs();
+ DECLARE_MAC_BUF(mac);
+
+ seq_printf(seq,
+ "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
+ (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
+ pkt_dev->max_pkt_size);
+
+ /* the delay is kept as delay_us/delay_ns and shown as one combined value */
+ seq_printf(seq,
+ " frags: %d delay: %u clone_skb: %d ifname: %s\n",
+ pkt_dev->nfrags,
+ 1000 * pkt_dev->delay_us + pkt_dev->delay_ns,
+ pkt_dev->clone_skb, pkt_dev->odev->name);
+
+ seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows,
+ pkt_dev->lflow);
+
+ seq_printf(seq,
+ " queue_map_min: %u queue_map_max: %u\n",
+ pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
+
+ /* addresses: binary IPv6 values are formatted here, IPv4 ranges are stored as text */
+ if (pkt_dev->flags & F_IPV6) {
+ char b1[128], b2[128], b3[128];
+ fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr);
+ fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
+ fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
+ seq_printf(seq,
+ " saddr: %s min_saddr: %s max_saddr: %s\n", b1,
+ b2, b3);
+
+ fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr);
+ fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
+ fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
+ seq_printf(seq,
+ " daddr: %s min_daddr: %s max_daddr: %s\n", b1,
+ b2, b3);
+
+ } else
+ seq_printf(seq,
+ " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n",
+ pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min,
+ pkt_dev->src_max);
+
+ seq_puts(seq, " src_mac: ");
+
+ /* an all-zero configured src_mac means: show the output device's own address */
+ seq_printf(seq, "%s ",
+ print_mac(mac, is_zero_ether_addr(pkt_dev->src_mac) ?
+ pkt_dev->odev->dev_addr : pkt_dev->src_mac));
+
+ seq_printf(seq, "dst_mac: ");
+ seq_printf(seq, "%s\n", print_mac(mac, pkt_dev->dst_mac));
+
+ seq_printf(seq,
+ " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
+ pkt_dev->udp_src_min, pkt_dev->udp_src_max,
+ pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
+
+ seq_printf(seq,
+ " src_mac_count: %d dst_mac_count: %d\n",
+ pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
+
+ /* optional settings below are only printed when configured */
+ if (pkt_dev->nr_labels) {
+ unsigned i;
+ seq_printf(seq, " mpls: ");
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
+ i == pkt_dev->nr_labels-1 ? "\n" : ", ");
+ }
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->vlan_id, pkt_dev->vlan_p, pkt_dev->vlan_cfi);
+ }
+
+ /*
+ * NOTE(review): the labels below read "vlan_p"/"vlan_cfi" although the
+ * values printed are svlan_p/svlan_cfi.
+ */
+ if (pkt_dev->svlan_id != 0xffff) {
+ seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->svlan_id, pkt_dev->svlan_p, pkt_dev->svlan_cfi);
+ }
+
+ if (pkt_dev->tos) {
+ seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos);
+ }
+
+ if (pkt_dev->traffic_class) {
+ seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
+ }
+
+ seq_printf(seq, " Flags: ");
+
+ if (pkt_dev->flags & F_IPV6)
+ seq_printf(seq, "IPV6 ");
+
+ if (pkt_dev->flags & F_IPSRC_RND)
+ seq_printf(seq, "IPSRC_RND ");
+
+ if (pkt_dev->flags & F_IPDST_RND)
+ seq_printf(seq, "IPDST_RND ");
+
+ if (pkt_dev->flags & F_TXSIZE_RND)
+ seq_printf(seq, "TXSIZE_RND ");
+
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ seq_printf(seq, "UDPSRC_RND ");
+
+ if (pkt_dev->flags & F_UDPDST_RND)
+ seq_printf(seq, "UDPDST_RND ");
+
+ if (pkt_dev->flags & F_MPLS_RND)
+ seq_printf(seq, "MPLS_RND ");
+
+ if (pkt_dev->flags & F_QUEUE_MAP_RND)
+ seq_printf(seq, "QUEUE_MAP_RND ");
+
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ seq_printf(seq, "QUEUE_MAP_CPU ");
+
+ /* the flow mode is only reported when flows are configured */
+ if (pkt_dev->cflows) {
+ if (pkt_dev->flags & F_FLOW_SEQ)
+ seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/
+ else
+ seq_printf(seq, "FLOW_RND ");
+ }
+
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC_ON)
+ seq_printf(seq, "IPSEC ");
+#endif
+
+ if (pkt_dev->flags & F_MACSRC_RND)
+ seq_printf(seq, "MACSRC_RND ");
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ seq_printf(seq, "MACDST_RND ");
+
+ if (pkt_dev->flags & F_VID_RND)
+ seq_printf(seq, "VID_RND ");
+
+ if (pkt_dev->flags & F_SVID_RND)
+ seq_printf(seq, "SVID_RND ");
+
+ seq_puts(seq, "\n");
+
+ sa = pkt_dev->started_at;
+ stopped = pkt_dev->stopped_at;
+ if (pkt_dev->running)
+ stopped = now; /* not really stopped, more like last-running-at */
+
+ seq_printf(seq,
+ "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n",
+ (unsigned long long)pkt_dev->sofar,
+ (unsigned long long)pkt_dev->errors, (unsigned long long)sa,
+ (unsigned long long)stopped,
+ (unsigned long long)pkt_dev->idle_acc);
+
+ seq_printf(seq,
+ " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
+ pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset,
+ pkt_dev->cur_src_mac_offset);
+
+ if (pkt_dev->flags & F_IPV6) {
+ char b1[128], b2[128];
+ fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr);
+ fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr);
+ seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
+ } else
+ seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
+ pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+
+ seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
+ pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
+
+ seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map);
+
+ seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
+
+ /* an empty result string is reported as "Idle" */
+ if (pkt_dev->result[0])
+ seq_printf(seq, "Result: %s\n", pkt_dev->result);
+ else
+ seq_printf(seq, "Result: Idle\n");
+
+ return 0;
+}
+
+
+/*
+ * hex32_arg - parse a hexadecimal number from user space
+ * @user_buffer: user pointer to the first character to examine
+ * @maxlen:      maximum number of characters to consume
+ * @num:         receives the parsed value (0 when no digit was found)
+ *
+ * Consumes hex digits (either case) until a non-hex character or @maxlen
+ * characters have been seen.  The accumulator is shifted only when a digit
+ * is actually accepted, so input that ends before @maxlen digits yields the
+ * value of the digits read and nothing more.
+ *
+ * Returns the number of digits consumed, or -EFAULT if the user buffer
+ * cannot be read.
+ */
+static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, __u32 *num)
+{
+	int i = 0;
+	*num = 0;
+
+	for (; i < maxlen; i++) {
+		__u32 digit;
+		char c;
+
+		if (get_user(c, &user_buffer[i]))
+			return -EFAULT;
+		if ((c >= '0') && (c <= '9'))
+			digit = c - '0';
+		else if ((c >= 'a') && (c <= 'f'))
+			digit = c - 'a' + 10;
+		else if ((c >= 'A') && (c <= 'F'))
+			digit = c - 'A' + 10;
+		else
+			break;
+		*num = (*num << 4) | digit;
+	}
+	return i;
+}
+
+/*
+ * Count the leading run of separator characters (double quote, newline,
+ * carriage return, tab, space or '=') in a user buffer, looking at no more
+ * than @maxlen characters.
+ *
+ * Returns the length of that run, or -EFAULT if the buffer cannot be read.
+ */
+static int count_trail_chars(const char __user * user_buffer,
+			     unsigned int maxlen)
+{
+	int pos = 0;
+
+	while (pos < maxlen) {
+		char ch;
+
+		if (get_user(ch, &user_buffer[pos]))
+			return -EFAULT;
+
+		if (ch != '\"' && ch != '\n' && ch != '\r' &&
+		    ch != '\t' && ch != ' ' && ch != '=')
+			break;
+
+		pos++;
+	}
+
+	return pos;
+}
+
+/*
+ * Parse an unsigned decimal number from a user buffer.
+ *
+ * Decimal digits are consumed until a non-digit is met or @maxlen characters
+ * have been read; the value is stored in *@num (0 when there is no digit).
+ *
+ * Returns the number of digits consumed.  A failed user access returns
+ * -EFAULT converted to the unsigned return type.
+ */
+static unsigned long num_arg(const char __user * user_buffer,
+			     unsigned long maxlen, unsigned long *num)
+{
+	int pos;
+
+	*num = 0;
+
+	for (pos = 0; pos < maxlen; pos++) {
+		char ch;
+
+		if (get_user(ch, &user_buffer[pos]))
+			return -EFAULT;
+
+		if (ch < '0' || ch > '9')
+			break;
+
+		*num = *num * 10 + (ch - '0');
+	}
+
+	return pos;
+}
+
+/*
+ * Length of the token at the start of a user buffer: the number of
+ * characters before the first double quote, newline, carriage return, tab
+ * or space, capped at @maxlen.
+ *
+ * Returns that length, or -EFAULT if the buffer cannot be read.
+ */
+static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+{
+	int pos;
+
+	for (pos = 0; pos < maxlen; pos++) {
+		char ch;
+
+		if (get_user(ch, &user_buffer[pos]))
+			return -EFAULT;
+
+		if (ch == '\"' || ch == '\n' || ch == '\r' ||
+		    ch == '\t' || ch == ' ')
+			return pos;
+	}
+
+	return pos;
+}
+
+/*
+ * get_labels - parse a comma separated list of MPLS labels from user space
+ * @buffer:  user pointer to the first label
+ * @pkt_dev: device whose labels[] / nr_labels are filled in
+ *
+ * Each label is read with hex32_arg() (at most 8 hex digits) and stored in
+ * network byte order.  A label with the MPLS_STACK_BOTTOM bit set switches
+ * F_MPLS_RND on.  Parsing stops at the first label that is not followed by
+ * a comma.  nr_labels is reset to 0 on entry and only updated on success.
+ *
+ * Returns the number of characters consumed (including the character that
+ * followed the last label), the hex32_arg() result if a label could not be
+ * parsed, -EFAULT on a failed user access, or -E2BIG when more than
+ * MAX_MPLS_LABELS labels are supplied.
+ */
+static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+{
+	unsigned n = 0;
+	char c;
+	ssize_t i = 0;
+	int len;
+
+	pkt_dev->nr_labels = 0;
+	do {
+		__u32 tmp;
+
+		/*
+		 * Check before storing: a stack of exactly MAX_MPLS_LABELS
+		 * entries fits in labels[] and must be accepted.
+		 */
+		if (n >= MAX_MPLS_LABELS)
+			return -E2BIG;
+
+		len = hex32_arg(&buffer[i], 8, &tmp);
+		if (len <= 0)
+			return len;
+		pkt_dev->labels[n] = htonl(tmp);
+		if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
+			pkt_dev->flags |= F_MPLS_RND;
+		i += len;
+		if (get_user(c, &buffer[i]))
+			return -EFAULT;
+		i++;
+		n++;
+	} while (c == ',');
+
+	pkt_dev->nr_labels = n;
+	return i;
+}
+
+static ssize_t pktgen_if_write(struct file *file,
+ const char __user * user_buffer, size_t count,
+ loff_t * offset)
+{
+ struct seq_file *seq = (struct seq_file *)file->private_data;
+ struct pktgen_dev *pkt_dev = seq->private;
+ int i = 0, max, len;
+ char name[16], valstr[32];
+ unsigned long value = 0;
+ char *pg_result = NULL;
+ int tmp = 0;
+ char buf[128];
+
+ pg_result = &(pkt_dev->result[0]);
+
+ if (count < 1) {
+ printk(KERN_WARNING "pktgen: wrong command format\n");
+ return -EINVAL;
+ }
+
+ max = count - i;
+ tmp = count_trail_chars(&user_buffer[i], max);
+ if (tmp < 0) {
+ printk(KERN_WARNING "pktgen: illegal format\n");
+ return tmp;
+ }
+ i += tmp;
+
+ /* Read variable name */
+
+ len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ if (len < 0) {
+ return len;
+ }
+ memset(name, 0, sizeof(name));
+ if (copy_from_user(name, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ max = count - i;
+ len = count_trail_chars(&user_buffer[i], max);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (debug) {
+ char tb[count + 1];
+ if (copy_from_user(tb, user_buffer, count))
+ return -EFAULT;
+ tb[count] = 0;
+ printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name,
+ (unsigned long)count, tb);
+ }
+
+ if (!strcmp(name, "min_pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: min_pkt_size=%u",
+ pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "max_pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->max_pkt_size) {
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: max_pkt_size=%u",
+ pkt_dev->max_pkt_size);
+ return count;
+ }
+
+ /* Shortcut for min = max */
+
+ if (!strcmp(name, "pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "debug")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ debug = value;
+ sprintf(pg_result, "OK: debug=%u", debug);
+ return count;
+ }
+
+ if (!strcmp(name, "frags")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ pkt_dev->nfrags = value;
+ sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ return count;
+ }
+ if (!strcmp(name, "delay")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value == 0x7FFFFFFF) {
+ pkt_dev->delay_us = 0x7FFFFFFF;
+ pkt_dev->delay_ns = 0;
+ } else {
+ pkt_dev->delay_us = value / 1000;
+ pkt_dev->delay_ns = value % 1000;
+ }
+ sprintf(pg_result, "OK: delay=%u",
+ 1000 * pkt_dev->delay_us + pkt_dev->delay_ns);
+ return count;
+ }
+ if (!strcmp(name, "udp_src_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value != pkt_dev->udp_src_min) {
+ pkt_dev->udp_src_min = value;
+ pkt_dev->cur_udp_src = value;
+ }
+ sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min);
+ return count;
+ }
+ if (!strcmp(name, "udp_dst_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value != pkt_dev->udp_dst_min) {
+ pkt_dev->udp_dst_min = value;
+ pkt_dev->cur_udp_dst = value;
+ }
+ sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min);
+ return count;
+ }
+ if (!strcmp(name, "udp_src_max")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value != pkt_dev->udp_src_max) {
+ pkt_dev->udp_src_max = value;
+ pkt_dev->cur_udp_src = value;
+ }
+ sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max);
+ return count;
+ }
+ if (!strcmp(name, "udp_dst_max")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value != pkt_dev->udp_dst_max) {
+ pkt_dev->udp_dst_max = value;
+ pkt_dev->cur_udp_dst = value;
+ }
+ sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max);
+ return count;
+ }
+ if (!strcmp(name, "clone_skb")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ pkt_dev->clone_skb = value;
+
+ sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
+ return count;
+ }
+ if (!strcmp(name, "count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ pkt_dev->count = value;
+ sprintf(pg_result, "OK: count=%llu",
+ (unsigned long long)pkt_dev->count);
+ return count;
+ }
+ if (!strcmp(name, "src_mac_count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (pkt_dev->src_mac_count != value) {
+ pkt_dev->src_mac_count = value;
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: src_mac_count=%d",
+ pkt_dev->src_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "dst_mac_count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (pkt_dev->dst_mac_count != value) {
+ pkt_dev->dst_mac_count = value;
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: dst_mac_count=%d",
+ pkt_dev->dst_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "flag")) {
+ char f[32];
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0) {
+ return len;
+ }
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+ if (strcmp(f, "IPSRC_RND") == 0)
+ pkt_dev->flags |= F_IPSRC_RND;
+
+ else if (strcmp(f, "!IPSRC_RND") == 0)
+ pkt_dev->flags &= ~F_IPSRC_RND;
+
+ else if (strcmp(f, "TXSIZE_RND") == 0)
+ pkt_dev->flags |= F_TXSIZE_RND;
+
+ else if (strcmp(f, "!TXSIZE_RND") == 0)
+ pkt_dev->flags &= ~F_TXSIZE_RND;
+
+ else if (strcmp(f, "IPDST_RND") == 0)
+ pkt_dev->flags |= F_IPDST_RND;
+
+ else if (strcmp(f, "!IPDST_RND") == 0)
+ pkt_dev->flags &= ~F_IPDST_RND;
+
+ else if (strcmp(f, "UDPSRC_RND") == 0)
+ pkt_dev->flags |= F_UDPSRC_RND;
+
+ else if (strcmp(f, "!UDPSRC_RND") == 0)
+ pkt_dev->flags &= ~F_UDPSRC_RND;
+
+ else if (strcmp(f, "UDPDST_RND") == 0)
+ pkt_dev->flags |= F_UDPDST_RND;
+
+ else if (strcmp(f, "!UDPDST_RND") == 0)
+ pkt_dev->flags &= ~F_UDPDST_RND;
+
+ else if (strcmp(f, "MACSRC_RND") == 0)
+ pkt_dev->flags |= F_MACSRC_RND;
+
+ else if (strcmp(f, "!MACSRC_RND") == 0)
+ pkt_dev->flags &= ~F_MACSRC_RND;
+
+ else if (strcmp(f, "MACDST_RND") == 0)
+ pkt_dev->flags |= F_MACDST_RND;
+
+ else if (strcmp(f, "!MACDST_RND") == 0)
+ pkt_dev->flags &= ~F_MACDST_RND;
+
+ else if (strcmp(f, "MPLS_RND") == 0)
+ pkt_dev->flags |= F_MPLS_RND;
+
+ else if (strcmp(f, "!MPLS_RND") == 0)
+ pkt_dev->flags &= ~F_MPLS_RND;
+
+ else if (strcmp(f, "VID_RND") == 0)
+ pkt_dev->flags |= F_VID_RND;
+
+ else if (strcmp(f, "!VID_RND") == 0)
+ pkt_dev->flags &= ~F_VID_RND;
+
+ else if (strcmp(f, "SVID_RND") == 0)
+ pkt_dev->flags |= F_SVID_RND;
+
+ else if (strcmp(f, "!SVID_RND") == 0)
+ pkt_dev->flags &= ~F_SVID_RND;
+
+ else if (strcmp(f, "FLOW_SEQ") == 0)
+ pkt_dev->flags |= F_FLOW_SEQ;
+
+ else if (strcmp(f, "QUEUE_MAP_RND") == 0)
+ pkt_dev->flags |= F_QUEUE_MAP_RND;
+
+ else if (strcmp(f, "!QUEUE_MAP_RND") == 0)
+ pkt_dev->flags &= ~F_QUEUE_MAP_RND;
+
+ else if (strcmp(f, "QUEUE_MAP_CPU") == 0)
+ pkt_dev->flags |= F_QUEUE_MAP_CPU;
+
+ else if (strcmp(f, "!QUEUE_MAP_CPU") == 0)
+ pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
+#ifdef CONFIG_XFRM
+ else if (strcmp(f, "IPSEC") == 0)
+ pkt_dev->flags |= F_IPSEC_ON;
+#endif
+
+ else if (strcmp(f, "!IPV6") == 0)
+ pkt_dev->flags &= ~F_IPV6;
+
+ else {
+ sprintf(pg_result,
+ "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
+ f,
+ "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+ return count;
+ }
+ if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+ if (len < 0) {
+ return len;
+ }
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_min) != 0) {
+ memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
+ strncpy(pkt_dev->dst_min, buf, len);
+ pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ }
+ if (debug)
+ printk(KERN_DEBUG "pktgen: dst_min set to: %s\n",
+ pkt_dev->dst_min);
+ i += len;
+ sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
+ return count;
+ }
+ if (!strcmp(name, "dst_max")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+ if (len < 0) {
+ return len;
+ }
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_max) != 0) {
+ memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
+ strncpy(pkt_dev->dst_max, buf, len);
+ pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+ pkt_dev->cur_daddr = pkt_dev->daddr_max;
+ }
+ if (debug)
+ printk(KERN_DEBUG "pktgen: dst_max set to: %s\n",
+ pkt_dev->dst_max);
+ i += len;
+ sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
+ return count;
+ }
+ if (!strcmp(name, "dst6")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+ fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+
+ ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "dst6_min")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+ fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+
+ ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
+ &pkt_dev->min_in6_daddr);
+ if (debug)
+ printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6_min=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "dst6_max")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+ fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6_max=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "src6")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+ fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+
+ ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: src6=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "src_min")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+ if (len < 0) {
+ return len;
+ }
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_min) != 0) {
+ memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
+ strncpy(pkt_dev->src_min, buf, len);
+ pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ }
+ if (debug)
+ printk(KERN_DEBUG "pktgen: src_min set to: %s\n",
+ pkt_dev->src_min);
+ i += len;
+ sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
+ return count;
+ }
+ if (!strcmp(name, "src_max")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+ if (len < 0) {
+ return len;
+ }
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_max) != 0) {
+ memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
+ strncpy(pkt_dev->src_max, buf, len);
+ pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+ pkt_dev->cur_saddr = pkt_dev->saddr_max;
+ }
+ if (debug)
+ printk(KERN_DEBUG "pktgen: src_max set to: %s\n",
+ pkt_dev->src_max);
+ i += len;
+ sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
+ return count;
+ }
+ if (!strcmp(name, "dst_mac")) {
+ char *v = valstr;
+ unsigned char old_dmac[ETH_ALEN];
+ unsigned char *m = pkt_dev->dst_mac;
+ memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN);
+
+ len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ if (len < 0) {
+ return len;
+ }
+ memset(valstr, 0, sizeof(valstr));
+ if (copy_from_user(valstr, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) {
+ if (*v >= '0' && *v <= '9') {
+ *m *= 16;
+ *m += *v - '0';
+ }
+ if (*v >= 'A' && *v <= 'F') {
+ *m *= 16;
+ *m += *v - 'A' + 10;
+ }
+ if (*v >= 'a' && *v <= 'f') {
+ *m *= 16;
+ *m += *v - 'a' + 10;
+ }
+ if (*v == ':') {
+ m++;
+ *m = 0;
+ }
+ }
+
+ /* Set up Dest MAC */
+ if (compare_ether_addr(old_dmac, pkt_dev->dst_mac))
+ memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
+
+ sprintf(pg_result, "OK: dstmac");
+ return count;
+ }
+ if (!strcmp(name, "src_mac")) {
+ char *v = valstr;
+ unsigned char old_smac[ETH_ALEN];
+ unsigned char *m = pkt_dev->src_mac;
+
+ memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN);
+
+ len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ if (len < 0) {
+ return len;
+ }
+ memset(valstr, 0, sizeof(valstr));
+ if (copy_from_user(valstr, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) {
+ if (*v >= '0' && *v <= '9') {
+ *m *= 16;
+ *m += *v - '0';
+ }
+ if (*v >= 'A' && *v <= 'F') {
+ *m *= 16;
+ *m += *v - 'A' + 10;
+ }
+ if (*v >= 'a' && *v <= 'f') {
+ *m *= 16;
+ *m += *v - 'a' + 10;
+ }
+ if (*v == ':') {
+ m++;
+ *m = 0;
+ }
+ }
+
+ /* Set up Src MAC */
+ if (compare_ether_addr(old_smac, pkt_dev->src_mac))
+ memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN);
+
+ sprintf(pg_result, "OK: srcmac");
+ return count;
+ }
+
+ if (!strcmp(name, "clear_counters")) {
+ pktgen_clear_counters(pkt_dev);
+ sprintf(pg_result, "OK: Clearing counters.\n");
+ return count;
+ }
+
+ if (!strcmp(name, "flows")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value > MAX_CFLOWS)
+ value = MAX_CFLOWS;
+
+ pkt_dev->cflows = value;
+ sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
+ return count;
+ }
+
+ if (!strcmp(name, "flowlen")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ pkt_dev->lflow = value;
+ sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_min")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ pkt_dev->queue_map_min = value;
+ sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_max")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ pkt_dev->queue_map_max = value;
+ sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
+ return count;
+ }
+
+ if (!strcmp(name, "mpls")) {
+ unsigned n, cnt;
+
+ len = get_labels(&user_buffer[i], pkt_dev);
+ if (len < 0)
+ return len;
+ i += len;
+ cnt = sprintf(pg_result, "OK: mpls=");
+ for (n = 0; n < pkt_dev->nr_labels; n++)
+ cnt += sprintf(pg_result + cnt,
+ "%08x%s", ntohl(pkt_dev->labels[n]),
+ n == pkt_dev->nr_labels-1 ? "" : ",");
+
+ if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: VLAN/SVLAN auto turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (value <= 4095) {
+ pkt_dev->vlan_id = value; /* turn on VLAN */
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: VLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ printk(KERN_DEBUG "pktgen: MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_p = value;
+ sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_cfi = value;
+ sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
+ pkt_dev->svlan_id = value; /* turn on SVLAN */
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: SVLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ printk(KERN_DEBUG "pktgen: MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_p = value;
+ sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_cfi = value;
+ sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "tos")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (len == 2) {
+ pkt_dev->tos = tmp_value;
+ sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
+ } else {
+ sprintf(pg_result, "ERROR: tos must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "traffic_class")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0) {
+ return len;
+ }
+ i += len;
+ if (len == 2) {
+ pkt_dev->traffic_class = tmp_value;
+ sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
+ } else {
+ sprintf(pg_result, "ERROR: traffic_class must be 00-ff");
+ }
+ return count;
+ }
+
+ sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
+ return -EINVAL;
+}
+
+/*
+ * proc open handler for the pktgen_if_* files: hands the data pointer stored
+ * in the proc entry to the single-shot seq_file show routine.
+ */
+static int pktgen_if_open(struct inode *inode, struct file *file)
+{
+	void *entry_data = PDE(inode)->data;
+
+	return single_open(file, pktgen_if_show, entry_data);
+}
+
+/*
+ * File operations behind pktgen_if_open(): reading and seeking go through the
+ * seq_file helpers, writing is handled by pktgen_if_write().
+ */
+static const struct file_operations pktgen_if_fops = {
+	.owner   = THIS_MODULE,
+	/* lifetime */
+	.open    = pktgen_if_open,
+	.release = single_release,
+	/* data path */
+	.read    = seq_read,
+	.write   = pktgen_if_write,
+	.llseek  = seq_lseek,
+};
+
+/*
+ * seq_file show routine for a pktgen thread: prints the names of the running
+ * devices, then the stopped ones, then the thread's last result string
+ * ("NA" when it is empty). The device list is walked under the thread's
+ * if_lock. Always returns 0.
+ */
+static int pktgen_thread_show(struct seq_file *seq, void *v)
+{
+	struct pktgen_thread *thread = seq->private;
+	struct pktgen_dev *dev;
+
+	BUG_ON(!thread);
+
+	seq_printf(seq, "Running: ");
+
+	if_lock(thread);
+
+	list_for_each_entry(dev, &thread->if_list, list) {
+		if (!dev->running)
+			continue;
+		seq_printf(seq, "%s ", dev->odev->name);
+	}
+
+	seq_printf(seq, "\nStopped: ");
+
+	list_for_each_entry(dev, &thread->if_list, list) {
+		if (dev->running)
+			continue;
+		seq_printf(seq, "%s ", dev->odev->name);
+	}
+
+	if (!thread->result[0])
+		seq_printf(seq, "\nResult: NA\n");
+	else
+		seq_printf(seq, "\nResult: %s\n", thread->result);
+
+	if_unlock(thread);
+
+	return 0;
+}
+
+/*
+ * Write handler for a pktgen thread's proc file.
+ *
+ * The user buffer carries one command: a name optionally followed by an
+ * argument. Recognised names are "add_device" (argument: interface name),
+ * "rem_device_all" and the obsolete "max_before_softirq". A textual status
+ * is left in t->result for the commands that are recognised.
+ *
+ * Returns count on success; -EINVAL for an empty write, a missing thread or
+ * an unknown command; -EFAULT when the user buffer cannot be copied; or the
+ * negative value handed back by the parsing helpers or pktgen_add_device().
+ */
+static ssize_t pktgen_thread_write(struct file *file,
+				   const char __user * user_buffer,
+				   size_t count, loff_t * offset)
+{
+	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct pktgen_thread *t = seq->private;
+	int i = 0, max, len, ret;
+	char name[40];
+	char *pg_result;
+
+	/* pg_result is not set up yet, so only the error code can be reported */
+	if (count < 1)
+		return -EINVAL;
+
+	/* Skip leading separator characters */
+	max = count - i;
+	len = count_trail_chars(&user_buffer[i], max);
+	if (len < 0)
+		return len;
+
+	i += len;
+
+	/* Read variable name */
+
+	len = strn_len(&user_buffer[i], sizeof(name) - 1);
+	if (len < 0)
+		return len;
+
+	memset(name, 0, sizeof(name));
+	if (copy_from_user(name, &user_buffer[i], len))
+		return -EFAULT;
+	i += len;
+
+	/* Skip the separators between the name and its argument */
+	max = count - i;
+	len = count_trail_chars(&user_buffer[i], max);
+	if (len < 0)
+		return len;
+
+	i += len;
+
+	if (debug)
+		printk(KERN_DEBUG "pktgen: t=%s, count=%lu\n",
+		       name, (unsigned long)count);
+
+	if (!t) {
+		printk(KERN_ERR "pktgen: ERROR: No thread\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pg_result = &(t->result[0]);
+
+	if (!strcmp(name, "add_device")) {
+		char f[32];
+		memset(f, 0, 32);
+		len = strn_len(&user_buffer[i], sizeof(f) - 1);
+		if (len < 0) {
+			ret = len;
+			goto out;
+		}
+		if (copy_from_user(f, &user_buffer[i], len))
+			return -EFAULT;
+		i += len;
+		mutex_lock(&pktgen_thread_lock);
+		ret = pktgen_add_device(t, f);
+		mutex_unlock(&pktgen_thread_lock);
+		/*
+		 * Report a failed add to the writer instead of claiming
+		 * success.
+		 * NOTE(review): assumes pktgen_add_device() returns 0 on
+		 * success and a negative errno otherwise -- it is defined
+		 * outside this part of the file, confirm.
+		 */
+		if (!ret) {
+			ret = count;
+			sprintf(pg_result, "OK: add_device=%s", f);
+		} else {
+			sprintf(pg_result, "ERROR: can not add device %s", f);
+		}
+		goto out;
+	}
+
+	if (!strcmp(name, "rem_device_all")) {
+		mutex_lock(&pktgen_thread_lock);
+		t->control |= T_REMDEVALL;
+		mutex_unlock(&pktgen_thread_lock);
+		schedule_timeout_interruptible(msecs_to_jiffies(125));	/* Propagate thread->control  */
+		ret = count;
+		sprintf(pg_result, "OK: rem_device_all");
+		goto out;
+	}
+
+	if (!strcmp(name, "max_before_softirq")) {
+		sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use");
+		ret = count;
+		goto out;
+	}
+
+	ret = -EINVAL;
+out:
+	return ret;
+}
+
+/*
+ * proc open handler for a pktgen thread file: hands the data pointer stored
+ * in the proc entry to the single-shot seq_file show routine.
+ */
+static int pktgen_thread_open(struct inode *inode, struct file *file)
+{
+	void *entry_data = PDE(inode)->data;
+
+	return single_open(file, pktgen_thread_show, entry_data);
+}
+
+/*
+ * File operations behind pktgen_thread_open(): reading and seeking go through
+ * the seq_file helpers, writing is handled by pktgen_thread_write().
+ */
+static const struct file_operations pktgen_thread_fops = {
+	.owner   = THIS_MODULE,
+	/* lifetime */
+	.open    = pktgen_thread_open,
+	.release = single_release,
+	/* data path */
+	.read    = seq_read,
+	.write   = pktgen_thread_write,
+	.llseek  = seq_lseek,
+};
+
+/*
+ * Look ifname up across all pktgen threads ("find or remove").
+ *
+ * Returns the first matching pktgen_dev, or NULL when no thread has one.
+ * When remove is non-zero the match is additionally flagged with
+ * removal_mark and its thread gets T_REMDEV set, both under the thread's
+ * if_lock.
+ */
+static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)
+{
+	struct pktgen_thread *thread;
+
+	list_for_each_entry(thread, &pktgen_threads, th_list) {
+		struct pktgen_dev *match = pktgen_find_dev(thread, ifname);
+
+		if (!match)
+			continue;
+
+		if (remove) {
+			if_lock(thread);
+			match->removal_mark = 1;
+			thread->control |= T_REMDEV;
+			if_unlock(thread);
+		}
+		return match;
+	}
+
+	return NULL;
+}
+
+/*
+ * Mark the pktgen device bound to ifname for removal and wait for it to go
+ * away.
+ *
+ * The device is (re)marked on every pass; between passes pktgen_thread_lock
+ * is dropped and the caller sleeps msec_per_try milliseconds. The wait is
+ * abandoned with an error message after max_tries sleeps.
+ */
+static void pktgen_mark_device(const char *ifname)
+{
+	const int max_tries = 10, msec_per_try = 125;
+	int tries = 0;
+
+	mutex_lock(&pktgen_thread_lock);
+	pr_debug("pktgen: pktgen_mark_device marking %s for removal\n", ifname);
+
+	for (;;) {
+		/* no device left under that name: done */
+		if (__pktgen_NN_threads(ifname, REMOVE) == NULL)
+			break;
+
+		mutex_unlock(&pktgen_thread_lock);
+		pr_debug("pktgen: pktgen_mark_device waiting for %s "
+			 "to disappear....\n", ifname);
+		schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+		mutex_lock(&pktgen_thread_lock);
+
+		tries++;
+		if (tries >= max_tries) {
+			printk(KERN_ERR "pktgen_mark_device: timed out after "
+			       "waiting %d msec for device %s to be removed\n",
+			       msec_per_try * tries, ifname);
+			break;
+		}
+	}
+
+	mutex_unlock(&pktgen_thread_lock);
+}
+
+/*
+ * Follow a net_device rename: for each pktgen thread, the first pktgen_dev
+ * bound to dev has its proc entry removed and re-created under dev->name.
+ * A failure to re-create the entry is logged and leaves pkt_dev->entry NULL.
+ */
+static void pktgen_change_name(struct net_device *dev)
+{
+	struct pktgen_thread *t;
+
+	list_for_each_entry(t, &pktgen_threads, th_list) {
+		struct pktgen_dev *pkt_dev;
+
+		list_for_each_entry(pkt_dev, &t->if_list, list) {
+			if (pkt_dev->odev != dev)
+				continue;
+
+			remove_proc_entry(pkt_dev->entry->name, pg_proc_dir);
+
+			/*
+			 * The new entry must carry the file operations and
+			 * the private data again; a bare create_proc_entry()
+			 * leaves both unset, so the renamed file could no
+			 * longer be read or written.
+			 * NOTE(review): assumes the entry's data is the
+			 * pktgen_dev itself, as pktgen_if_open() passes it
+			 * straight to the show routine -- confirm against
+			 * the code that first creates the entry.
+			 */
+			pkt_dev->entry = proc_create_data(dev->name, 0600,
+							  pg_proc_dir,
+							  &pktgen_if_fops,
+							  pkt_dev);
+			if (!pkt_dev->entry)
+				printk(KERN_ERR "pktgen: can't move proc "
+				       " entry for '%s'\n", dev->name);
+			break;	/* only the first match per thread */
+		}
+	}
+}
+
+/*
+ * Netdevice notifier callback. Events for devices outside init_net are
+ * ignored. A rename re-creates the matching proc entries, an unregister marks
+ * the pktgen device of that name for removal. Always returns NOTIFY_DONE.
+ */
+static int pktgen_device_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *netdev = ptr;
+
+	if (!net_eq(dev_net(netdev), &init_net))
+		return NOTIFY_DONE;
+
+	/* It is OK that we do not hold the group lock right now,
+	 * as we run under the RTNL lock.
+	 */
+
+	if (event == NETDEV_CHANGENAME)
+		pktgen_change_name(netdev);
+	else if (event == NETDEV_UNREGISTER)
+		pktgen_mark_device(netdev->name);
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Resolve ifname to a net_device in init_net.
+ *
+ * Only the part of ifname before an optional '@' is used as the device name,
+ * limited to IFNAMSIZ characters. The result is whatever dev_get_by_name()
+ * returns for that name. pkt_dev is not used.
+ */
+static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, const char *ifname)
+{
+	char b[IFNAMSIZ+5];
+	int i;
+
+	/*
+	 * Check the bound before touching ifname[i] and stop at the
+	 * terminating NUL, so a short name without '@' is never read past
+	 * its end.
+	 */
+	for (i = 0; i < IFNAMSIZ && ifname[i] != '\0' && ifname[i] != '@'; i++)
+		b[i] = ifname[i];
+	b[i] = 0;
+
+	return dev_get_by_name(&init_net, b);
+}
+
+
+/*
+ * Associate pktgen_dev with a device.
+ *
+ * Any device reference already held in pkt_dev->odev is dropped first. The
+ * new device must exist, be of type ARPHRD_ETHER and be running; otherwise
+ * the reference just taken is released again.
+ *
+ * Returns 0 on success, -ENODEV, -EINVAL or -ENETDOWN on failure.
+ */
+static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)
+{
+	struct net_device *netdev;
+
+	/* Clean old setups */
+	if (pkt_dev->odev) {
+		dev_put(pkt_dev->odev);
+		pkt_dev->odev = NULL;
+	}
+
+	netdev = pktgen_dev_get_by_name(pkt_dev, ifname);
+	if (!netdev) {
+		printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", ifname);
+		return -ENODEV;
+	}
+
+	if (netdev->type != ARPHRD_ETHER) {
+		printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", ifname);
+		dev_put(netdev);
+		return -EINVAL;
+	}
+
+	if (!netif_running(netdev)) {
+		printk(KERN_ERR "pktgen: device is down: \"%s\"\n", ifname);
+		dev_put(netdev);
+		return -ENETDOWN;
+	}
+
+	pkt_dev->odev = netdev;
+	return 0;
+}
+
+/* Read pkt_dev from the interface and set up internal pktgen_dev
+ * structure to have the right information to create/send packets.
+ *
+ * Steps, in order: clamp queue_map_min/max to the device's TX queue count,
+ * fill the prebuilt header hh[] with the MAC addresses, reset the current
+ * packet size, derive the IPv4 address ranges (the IPv6 auto-configuration
+ * is compiled out), and reset the "current" iteration state.
+ * Does nothing but report an error when no output device is attached.
+ */
+static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
+{
+	int ntxq;
+
+	if (!pkt_dev->odev) {
+		printk(KERN_ERR "pktgen: ERROR: pkt_dev->odev == NULL in "
+		       "setup_inject.\n");
+		sprintf(pkt_dev->result,
+			"ERROR: pkt_dev->odev == NULL in setup_inject.\n");
+		return;
+	}
+
+	/* make sure that we don't pick a non-existing transmit queue */
+	ntxq = pkt_dev->odev->real_num_tx_queues;
+
+	/*
+	 * The reset value uses the same (ntxq ?: 1) - 1 expression as the
+	 * range printed in the warning, so a device reporting zero queues
+	 * yields queue 0 rather than -1.
+	 */
+	if (ntxq <= pkt_dev->queue_map_min) {
+		printk(KERN_WARNING "pktgen: WARNING: Requested "
+		       "queue_map_min (zero-based) (%d) exceeds valid range "
+		       "[0 - %d] for (%d) queues on %s, resetting\n",
+		       pkt_dev->queue_map_min, (ntxq ?: 1)- 1, ntxq,
+		       pkt_dev->odev->name);
+		pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
+	}
+	if (pkt_dev->queue_map_max >= ntxq) {
+		printk(KERN_WARNING "pktgen: WARNING: Requested "
+		       "queue_map_max (zero-based) (%d) exceeds valid range "
+		       "[0 - %d] for (%d) queues on %s, resetting\n",
+		       pkt_dev->queue_map_max, (ntxq ?: 1)- 1, ntxq,
+		       pkt_dev->odev->name);
+		pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
+	}
+
+	/* Default to the interface's mac if not explicitly set. */
+
+	if (is_zero_ether_addr(pkt_dev->src_mac))
+		memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN);
+
+	/* Set up Dest MAC */
+	memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
+
+	/* Set up pkt size */
+	pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+
+	if (pkt_dev->flags & F_IPV6) {
+		/*
+		 * Skip this automatic address setting until locks or functions
+		 * gets exported
+		 */
+
+#ifdef NOTNOW
+		int i, set = 0, err = 1;
+		struct inet6_dev *idev;
+
+		for (i = 0; i < IN6_ADDR_HSIZE; i++)
+			if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
+				set = 1;
+				break;
+			}
+
+		if (!set) {
+
+			/*
+			 * Use linklevel address if unconfigured.
+			 *
+			 * use ipv6_get_lladdr if/when it's get exported
+			 */
+
+			rcu_read_lock();
+			if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) {
+				struct inet6_ifaddr *ifp;
+
+				read_lock_bh(&idev->lock);
+				for (ifp = idev->addr_list; ifp;
+				     ifp = ifp->if_next) {
+					if (ifp->scope == IFA_LINK
+					    && !(ifp->
+						 flags & IFA_F_TENTATIVE)) {
+						ipv6_addr_copy(&pkt_dev->
+							       cur_in6_saddr,
+							       &ifp->addr);
+						err = 0;
+						break;
+					}
+				}
+				read_unlock_bh(&idev->lock);
+			}
+			rcu_read_unlock();
+			if (err)
+				printk(KERN_ERR "pktgen: ERROR: IPv6 link "
+				       "address not availble.\n");
+		}
+#endif
+	} else {
+		pkt_dev->saddr_min = 0;
+		pkt_dev->saddr_max = 0;
+		if (strlen(pkt_dev->src_min) == 0) {
+
+			struct in_device *in_dev;
+
+			/* No source configured: use the device's first address */
+			rcu_read_lock();
+			in_dev = __in_dev_get_rcu(pkt_dev->odev);
+			if (in_dev) {
+				if (in_dev->ifa_list) {
+					pkt_dev->saddr_min =
+					    in_dev->ifa_list->ifa_address;
+					pkt_dev->saddr_max = pkt_dev->saddr_min;
+				}
+			}
+			rcu_read_unlock();
+		} else {
+			pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+			pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+		}
+
+		pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+		pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+	}
+	/* Initialize current values. */
+	pkt_dev->cur_dst_mac_offset = 0;
+	pkt_dev->cur_src_mac_offset = 0;
+	pkt_dev->cur_saddr = pkt_dev->saddr_min;
+	pkt_dev->cur_daddr = pkt_dev->daddr_min;
+	pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+	pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+	pkt_dev->nflows = 0;
+}
+
+/*
+ * Wait until getCurUs() reaches spin_until_us.
+ *
+ * While more than one jiffy (plus 1us) remains the task sleeps for a jiffy;
+ * with more than 100us left it yields when a reschedule is pending and bails
+ * out early if the device is no longer running; below that it busy-polls.
+ * The elapsed time is added to pkt_dev->idle_acc, except on the early exit.
+ */
+static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us)
+{
+	__u64 began;
+	__u64 cur;
+
+	began = cur = getCurUs();
+	while (cur < spin_until_us) {
+		__u64 remaining = spin_until_us - cur;
+
+		/* TODO: optimize sleeping behavior */
+		if (remaining > jiffies_to_usecs(1) + 1) {
+			schedule_timeout_interruptible(1);
+		} else if (remaining > 100) {
+			if (!pkt_dev->running)
+				return;
+			if (need_resched())
+				schedule();
+		}
+
+		cur = getCurUs();
+	}
+
+	pkt_dev->idle_acc += cur - began;
+}
+
+/*
+ * Recompute pkt_dev->pkt_overhead from scratch: one u32 per MPLS label plus
+ * the VLAN and SVLAN tag sizes.
+ */
+static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
+{
+	pkt_dev->pkt_overhead = pkt_dev->nr_labels * sizeof(u32) +
+				VLAN_TAG_SIZE(pkt_dev) +
+				SVLAN_TAG_SIZE(pkt_dev);
+}
+
+/* Returns 1 when the given flow slot has F_INIT set, 0 otherwise. */
+static inline int f_seen(struct pktgen_dev *pkt_dev, int flow)
+{
+	return (pkt_dev->flows[flow].flags & F_INIT) ? 1 : 0;
+}
+
+/*
+ * Choose the flow slot for the next packet and return its index.
+ *
+ * With F_FLOW_SEQ the current slot is kept until its packet count reaches
+ * lflow; it is then cleared and the next slot (wrapping at cflows) becomes
+ * current. Otherwise a slot is drawn at random and cleared once its count
+ * exceeds lflow. The caller must make sure cflows is non-zero.
+ */
+static inline int f_pick(struct pktgen_dev *pkt_dev)
+{
+	int idx;
+
+	if (!(pkt_dev->flags & F_FLOW_SEQ)) {
+		idx = random32() % pkt_dev->cflows;
+		pkt_dev->curfl = idx;
+
+		if (pkt_dev->flows[idx].count > pkt_dev->lflow) {
+			pkt_dev->flows[idx].count = 0;
+			pkt_dev->flows[idx].flags = 0;
+		}
+		return pkt_dev->curfl;
+	}
+
+	idx = pkt_dev->curfl;
+	if (pkt_dev->flows[idx].count >= pkt_dev->lflow) {
+		/* this flow is used up: reset it and move on */
+		pkt_dev->flows[idx].count = 0;
+		pkt_dev->flows[idx].flags = 0;
+		pkt_dev->curfl += 1;
+		if (pkt_dev->curfl >= pkt_dev->cflows)
+			pkt_dev->curfl = 0;	/* wrap around */
+	}
+
+	return pkt_dev->curfl;
+}
+
+
+#ifdef CONFIG_XFRM
+/*
+ * Make sure the flow slot has an IPsec SA attached.
+ *
+ * A slot that already holds one is left alone. Otherwise an SA is looked up
+ * for the current destination/source address pair; when one is found it is
+ * stored in the slot and the packet overhead is recomputed with the SA's
+ * header length added.
+ */
+static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
+{
+	struct xfrm_state *sa = pkt_dev->flows[flow].x;
+
+	if (sa)
+		return;
+
+	/* slow path: no xfrm_state cached for this flow yet */
+	sa = xfrm_stateonly_find((xfrm_address_t *)&pkt_dev->cur_daddr,
+				 (xfrm_address_t *)&pkt_dev->cur_saddr,
+				 AF_INET,
+				 pkt_dev->ipsmode,
+				 pkt_dev->ipsproto, 0);
+	if (!sa)
+		return;
+
+	pkt_dev->flows[flow].x = sa;
+	set_pkt_overhead(pkt_dev);
+	pkt_dev->pkt_overhead += sa->props.header_len;
+}
+#endif
+/*
+ * Update pkt_dev->cur_queue_map for the next packet.
+ *
+ * F_QUEUE_MAP_CPU selects the current CPU number. Otherwise, when
+ * queue_map_min < queue_map_max, the queue is either drawn at random from
+ * [queue_map_min, queue_map_max] (F_QUEUE_MAP_RND) or advanced by one,
+ * wrapping back to queue_map_min. The result is finally reduced modulo the
+ * device's real_num_tx_queues.
+ */
+static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
+{
+	if (pkt_dev->flags & F_QUEUE_MAP_CPU) {
+		pkt_dev->cur_queue_map = smp_processor_id();
+	} else if (pkt_dev->queue_map_min < pkt_dev->queue_map_max) {
+		__u16 next;
+
+		if (pkt_dev->flags & F_QUEUE_MAP_RND) {
+			next = random32() %
+				(pkt_dev->queue_map_max -
+				 pkt_dev->queue_map_min + 1)
+				+ pkt_dev->queue_map_min;
+		} else {
+			next = pkt_dev->cur_queue_map + 1;
+			if (next > pkt_dev->queue_map_max)
+				next = pkt_dev->queue_map_min;
+		}
+		pkt_dev->cur_queue_map = next;
+	}
+
+	pkt_dev->cur_queue_map %= pkt_dev->odev->real_num_tx_queues;
+}
+
+/* Increment/randomize headers according to flags and current values
+ * for IP src/dest, UDP src/dst port, MAC-Addr src/dst
+ *
+ * Called once per generated packet. In order it updates: the flow slot,
+ * the source and destination MAC bytes in the prebuilt header hh[], the
+ * MPLS labels, the VLAN/SVLAN ids, the UDP ports, the IPv4 or IPv6
+ * addresses, the packet size and the TX queue, and finally bumps the
+ * packet counter of the selected flow slot.
+ */
+static void mod_cur_headers(struct pktgen_dev *pkt_dev)
+{
+ __u32 imn;
+ __u32 imx;
+ int flow = 0;
+
+ /* Flow slot 0 is used when flows are disabled (cflows == 0) */
+ if (pkt_dev->cflows)
+ flow = f_pick(pkt_dev);
+
+ /* Deal with source MAC */
+ if (pkt_dev->src_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ /* mc is the offset added to the base MAC: random, or a counter
+ * that wraps at src_mac_count
+ */
+ if (pkt_dev->flags & F_MACSRC_RND)
+ mc = random32() % pkt_dev->src_mac_count;
+ else {
+ mc = pkt_dev->cur_src_mac_offset++;
+ if (pkt_dev->cur_src_mac_offset >=
+ pkt_dev->src_mac_count)
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+
+ /* Byte-wise add of mc to src_mac[5..1], carrying tmp >> 8 into
+ * the next more significant byte; hh[6] is not modified here
+ */
+ tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[11] = tmp;
+ tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[10] = tmp;
+ tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[9] = tmp;
+ tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[8] = tmp;
+ tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
+ pkt_dev->hh[7] = tmp;
+ }
+
+ /* Deal with Destination MAC */
+ if (pkt_dev->dst_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ mc = random32() % pkt_dev->dst_mac_count;
+
+ else {
+ mc = pkt_dev->cur_dst_mac_offset++;
+ if (pkt_dev->cur_dst_mac_offset >=
+ pkt_dev->dst_mac_count) {
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ }
+
+ /* Same carry-propagating add as for the source MAC, written to
+ * hh[5..1]; hh[0] is not modified here
+ */
+ tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[5] = tmp;
+ tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[4] = tmp;
+ tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[3] = tmp;
+ tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[2] = tmp;
+ tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
+ pkt_dev->hh[1] = tmp;
+ }
+
+ /* Only labels that carry MPLS_STACK_BOTTOM are re-rolled; they keep
+ * that bit and get a random value masked with htonl(0x000fffff)
+ */
+ if (pkt_dev->flags & F_MPLS_RND) {
+ unsigned i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
+ pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
+ ((__force __be32)random32() &
+ htonl(0x000fffff));
+ }
+
+ /* 0xffff means the tag is disabled; a disabled tag is never randomized */
+ if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_id = random32() & (4096-1);
+ }
+
+ if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_id = random32() & (4096 - 1);
+ }
+
+ /* UDP source port: random draw from [udp_src_min, udp_src_max), or
+ * increment and wrap back to udp_src_min on reaching udp_src_max
+ */
+ if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ pkt_dev->cur_udp_src = random32() %
+ (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
+ + pkt_dev->udp_src_min;
+
+ else {
+ pkt_dev->cur_udp_src++;
+ if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max)
+ pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+ }
+ }
+
+ /* UDP destination port: same scheme as the source port */
+ if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
+ if (pkt_dev->flags & F_UDPDST_RND) {
+ pkt_dev->cur_udp_dst = random32() %
+ (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
+ + pkt_dev->udp_dst_min;
+ } else {
+ pkt_dev->cur_udp_dst++;
+ if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
+ pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+ }
+ }
+
+ if (!(pkt_dev->flags & F_IPV6)) {
+
+ /* IPv4 source: the arithmetic is done in host byte order; random
+ * draw from [imn, imx), or increment and wrap to imn past imx
+ */
+ if ((imn = ntohl(pkt_dev->saddr_min)) < (imx =
+ ntohl(pkt_dev->
+ saddr_max))) {
+ __u32 t;
+ if (pkt_dev->flags & F_IPSRC_RND)
+ t = random32() % (imx - imn) + imn;
+ else {
+ t = ntohl(pkt_dev->cur_saddr);
+ t++;
+ if (t > imx) {
+ t = imn;
+ }
+ }
+ pkt_dev->cur_saddr = htonl(t);
+ }
+
+ /* IPv4 destination: a flow slot that is already initialised keeps
+ * the address stored in it; otherwise a new one is chosen below
+ */
+ if (pkt_dev->cflows && f_seen(pkt_dev, flow)) {
+ pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
+ } else {
+ imn = ntohl(pkt_dev->daddr_min);
+ imx = ntohl(pkt_dev->daddr_max);
+ if (imn < imx) {
+ __u32 t;
+ __be32 s;
+ if (pkt_dev->flags & F_IPDST_RND) {
+
+ t = random32() % (imx - imn) + imn;
+ s = htonl(t);
+
+ /* Redraw until the address is none of loopback,
+ * multicast, limited broadcast or zeronet
+ */
+ while (ipv4_is_loopback(s) ||
+ ipv4_is_multicast(s) ||
+ ipv4_is_lbcast(s) ||
+ ipv4_is_zeronet(s) ||
+ ipv4_is_local_multicast(s)) {
+ t = random32() % (imx - imn) + imn;
+ s = htonl(t);
+ }
+ pkt_dev->cur_daddr = s;
+ } else {
+ t = ntohl(pkt_dev->cur_daddr);
+ t++;
+ if (t > imx) {
+ t = imn;
+ }
+ pkt_dev->cur_daddr = htonl(t);
+ }
+ }
+ /* Record the chosen destination in the flow slot and mark
+ * the slot initialised; nflows counts these initialisations
+ */
+ if (pkt_dev->cflows) {
+ pkt_dev->flows[flow].flags |= F_INIT;
+ pkt_dev->flows[flow].cur_daddr =
+ pkt_dev->cur_daddr;
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC_ON)
+ get_ipsec_sa(pkt_dev, flow);
+#endif
+ pkt_dev->nflows++;
+ }
+ }
+ } else { /* IPV6 * */
+
+ /* An all-zero min_in6_daddr leaves cur_in6_daddr untouched
+ * (the if below has an intentionally empty body)
+ */
+ if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
+ pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
+ pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
+ pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ;
+ else {
+ int i;
+
+ /* Only random destinations yet */
+
+ /* Per 32-bit word: (random | min) & max */
+ for (i = 0; i < 4; i++) {
+ pkt_dev->cur_in6_daddr.s6_addr32[i] =
+ (((__force __be32)random32() |
+ pkt_dev->min_in6_daddr.s6_addr32[i]) &
+ pkt_dev->max_in6_daddr.s6_addr32[i]);
+ }
+ }
+ }
+
+ /* Packet size: random draw from [min_pkt_size, max_pkt_size), or
+ * increment and wrap to min_pkt_size past max_pkt_size
+ */
+ if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
+ __u32 t;
+ if (pkt_dev->flags & F_TXSIZE_RND) {
+ t = random32() %
+ (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
+ + pkt_dev->min_pkt_size;
+ } else {
+ t = pkt_dev->cur_pkt_size + 1;
+ if (t > pkt_dev->max_pkt_size)
+ t = pkt_dev->min_pkt_size;
+ }
+ pkt_dev->cur_pkt_size = t;
+ }
+
+ set_cur_queue_map(pkt_dev);
+
+ /* Counted even when flows are disabled, in which case flow is 0 */
+ pkt_dev->flows[flow].count++;
+}
+
+
+#ifdef CONFIG_XFRM
+/*
+ * Run skb through the IPsec SA cached for the current flow.
+ *
+ * Nothing is done (and 0 is returned) when the flow has no SA or the SA is
+ * not in transport mode. Otherwise the SA's outer-mode and type output
+ * handlers are applied under the SA lock and, on success, the SA's byte and
+ * packet lifetime counters are updated.
+ *
+ * Returns 0 on success or the error reported by either output handler.
+ */
+static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
+{
+	struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+	int err = 0;
+
+	if (!x)
+		return 0;
+	/* XXX: we dont support tunnel mode for now until
+	 * we resolve the dst issue */
+	if (x->props.mode != XFRM_MODE_TRANSPORT)
+		return 0;
+
+	spin_lock(&x->lock);
+
+	err = x->outer_mode->output(x, skb);
+	if (err)
+		goto error;
+	err = x->type->output(x, skb);
+	if (err)
+		goto error;
+
+	x->curlft.bytes +=skb->len;
+	x->curlft.packets++;
+error:
+	spin_unlock(&x->lock);
+	return err;
+}
+
+/*
+ * Release every IPsec SA cached in the device's flow slots.
+ *
+ * All cflows slots are scanned. nflows is not a usable bound: it is bumped
+ * each time a slot is (re)initialised, so it says nothing about which slot
+ * indices hold an SA and can grow past the number of slots.
+ */
+static inline void free_SAs(struct pktgen_dev *pkt_dev)
+{
+	unsigned int i;
+
+	/* let go of the SAs if we have them */
+	for (i = 0; i < pkt_dev->cflows; i++) {
+		struct xfrm_state *x = pkt_dev->flows[i].x;
+
+		if (x) {
+			xfrm_state_put(x);
+			pkt_dev->flows[i].x = NULL;
+		}
+	}
+}
+
+/*
+ * Apply IPsec to a freshly built packet when F_IPSEC_ON is set and the
+ * current flow has an SA.
+ *
+ * The skb head is grown if the SA header does not fit into the headroom, the
+ * link-layer header is pulled for the transform and rebuilt afterwards from
+ * pkt_dev->hh and protocol.
+ *
+ * Returns 1 when the packet may be sent (including when nothing had to be
+ * done), 0 when an error occurred; in that case the skb has been freed.
+ */
+static inline int process_ipsec(struct pktgen_dev *pkt_dev,
+			      struct sk_buff *skb, __be16 protocol)
+{
+	struct xfrm_state *x;
+	__u8 *eth;
+	int nhead;
+	int ret;
+
+	if (!(pkt_dev->flags & F_IPSEC_ON))
+		return 1;
+
+	x = pkt_dev->flows[pkt_dev->curfl].x;
+	if (!x)
+		return 1;
+
+	nhead = x->props.header_len - skb_headroom(skb);
+	if (nhead >0) {
+		ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
+		if (ret < 0) {
+			printk(KERN_ERR "Error expanding "
+			       "ipsec packet %d\n",ret);
+			goto err;
+		}
+	}
+
+	/* ipsec is not expecting ll header */
+	skb_pull(skb, ETH_HLEN);
+	ret = pktgen_output_ipsec(skb, pkt_dev);
+	if (ret) {
+		printk(KERN_ERR "Error creating ipsec "
+		       "packet %d\n",ret);
+		goto err;
+	}
+
+	/* restore ll */
+	eth = (__u8 *) skb_push(skb, ETH_HLEN);
+	memcpy(eth, pkt_dev->hh, 12);
+	*(u16 *) & eth[12] = protocol;
+
+	return 1;
+err:
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+/*
+ * Write the device's MPLS label stack to mpls: every label is stored with
+ * MPLS_STACK_BOTTOM cleared, then the bit is set on the last label written.
+ * The caller must only call this with nr_labels > 0.
+ */
+static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
+{
+	__be32 *slot = mpls;
+	unsigned n;
+
+	for (n = 0; n < pkt_dev->nr_labels; n++, slot++)
+		*slot = pkt_dev->labels[n] & ~MPLS_STACK_BOTTOM;
+
+	slot[-1] |= MPLS_STACK_BOTTOM;
+}
+
+/*
+ * Assemble a VLAN tag control field: id in the low bits, cfi shifted to
+ * bit 12 and prio shifted to bit 13, returned in network byte order.
+ */
+static inline __be16 build_tci(unsigned int id, unsigned int cfi,
+			       unsigned int prio)
+{
+	unsigned int tci = id;
+
+	tci |= cfi << 12;
+	tci |= prio << 13;
+
+	return htons(tci);
+}
+
+/*
+ * Build one IPv4/UDP packet for @pkt_dev, to be transmitted on @odev.
+ *
+ * Layout: ethernet header, optional MPLS labels, optional (S)VLAN tags,
+ * IP header, UDP header, then a payload that starts with a struct
+ * pktgen_hdr.  The payload lives in the linear area, or in page
+ * fragments when pkt_dev->nfrags is positive.
+ *
+ * Returns the new skb, or NULL when memory could not be allocated or
+ * the IPsec transform failed.
+ */
+static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
+					struct pktgen_dev *pkt_dev)
+{
+	struct sk_buff *skb = NULL;
+	__u8 *eth;
+	struct udphdr *udph;
+	int datalen, iplen;
+	struct iphdr *iph;
+	struct pktgen_hdr *pgh = NULL;
+	__be16 protocol = htons(ETH_P_IP);
+	__be32 *mpls;
+	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */
+	__be16 *vlan_encapsulated_proto = NULL;  /* packet type ID field (or len) for VLAN tag */
+	__be16 *svlan_tci = NULL;                /* Encapsulates priority and SVLAN ID */
+	__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+	u16 queue_map;
+
+	if (pkt_dev->nr_labels)
+		protocol = htons(ETH_P_MPLS_UC);
+
+	if (pkt_dev->vlan_id != 0xffff)
+		protocol = htons(ETH_P_8021Q);
+
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	queue_map = pkt_dev->cur_queue_map;
+	mod_cur_headers(pkt_dev);
+
+	/* headroom: link-layer header length rounded up to 16 bytes */
+	datalen = (odev->hard_header_len + 16) & ~0xf;
+	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + datalen +
+			pkt_dev->pkt_overhead, GFP_ATOMIC);
+	if (!skb) {
+		sprintf(pkt_dev->result, "No memory");
+		return NULL;
+	}
+
+	skb_reserve(skb, datalen);
+
+	/*  Reserve for ethernet and IP header  */
+	eth = (__u8 *) skb_push(skb, 14);
+	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+	if (pkt_dev->nr_labels)
+		mpls_push(mpls, pkt_dev);
+
+	if (pkt_dev->vlan_id != 0xffff) {
+		if (pkt_dev->svlan_id != 0xffff) {
+			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_tci = build_tci(pkt_dev->svlan_id,
+					       pkt_dev->svlan_cfi,
+					       pkt_dev->svlan_p);
+			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_encapsulated_proto = htons(ETH_P_8021Q);
+		}
+		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_tci = build_tci(pkt_dev->vlan_id,
+				      pkt_dev->vlan_cfi,
+				      pkt_dev->vlan_p);
+		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_encapsulated_proto = htons(ETH_P_IP);
+	}
+
+	skb->network_header = skb->tail;
+	skb->transport_header = skb->network_header + sizeof(struct iphdr);
+	skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr));
+	skb_set_queue_mapping(skb, queue_map);
+	iph = ip_hdr(skb);
+	udph = udp_hdr(skb);
+
+	memcpy(eth, pkt_dev->hh, 12);
+	*(__be16 *) & eth[12] = protocol;
+
+	/* Eth + IPh + UDPh + mpls */
+	datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
+		  pkt_dev->pkt_overhead;
+	/*
+	 * The sizeof comparison is unsigned, so a negative datalen has to
+	 * be caught explicitly; otherwise it would reach skb_put() as a
+	 * huge length.
+	 */
+	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
+		datalen = sizeof(struct pktgen_hdr);
+
+	udph->source = htons(pkt_dev->cur_udp_src);
+	udph->dest = htons(pkt_dev->cur_udp_dst);
+	udph->len = htons(datalen + 8);	/* DATA + udphdr */
+	udph->check = 0;	/* No checksum */
+
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->ttl = 32;
+	iph->tos = pkt_dev->tos;
+	iph->protocol = IPPROTO_UDP;	/* UDP */
+	iph->saddr = pkt_dev->cur_saddr;
+	iph->daddr = pkt_dev->cur_daddr;
+	iph->id = 0;	/* skb memory is not zeroed; do not send stale bytes */
+	iph->frag_off = 0;
+	iplen = 20 + 8 + datalen;
+	iph->tot_len = htons(iplen);
+	iph->check = 0;
+	iph->check = ip_fast_csum((void *)iph, iph->ihl);
+	skb->protocol = protocol;
+	skb->mac_header = (skb->network_header - ETH_HLEN -
+			   pkt_dev->pkt_overhead);
+	skb->dev = odev;
+	skb->pkt_type = PACKET_HOST;
+
+	if (pkt_dev->nfrags <= 0)
+		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
+	else {
+		int frags = pkt_dev->nfrags;
+		int i;
+
+		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
+
+		if (frags > MAX_SKB_FRAGS)
+			frags = MAX_SKB_FRAGS;
+		/* whatever does not fit into the pages stays linear */
+		if (datalen > frags * PAGE_SIZE) {
+			skb_put(skb, datalen - frags * PAGE_SIZE);
+			datalen = frags * PAGE_SIZE;
+		}
+
+		i = 0;
+		while (datalen > 0) {
+			struct page *page = alloc_pages(GFP_KERNEL, 0);
+
+			if (!page) {
+				/* fragments attached so far go with the skb */
+				kfree_skb(skb);
+				sprintf(pkt_dev->result, "No memory");
+				return NULL;
+			}
+			skb_shinfo(skb)->frags[i].page = page;
+			skb_shinfo(skb)->frags[i].page_offset = 0;
+			skb_shinfo(skb)->frags[i].size =
+			    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+			datalen -= skb_shinfo(skb)->frags[i].size;
+			skb->len += skb_shinfo(skb)->frags[i].size;
+			skb->data_len += skb_shinfo(skb)->frags[i].size;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+
+		/*
+		 * Fewer pages than requested fragments: keep splitting the
+		 * last fragment in half, both halves sharing one page.
+		 */
+		while (i < frags) {
+			int rem;
+
+			if (i == 0)
+				break;
+
+			rem = skb_shinfo(skb)->frags[i - 1].size / 2;
+			if (rem == 0)
+				break;
+
+			skb_shinfo(skb)->frags[i - 1].size -= rem;
+
+			skb_shinfo(skb)->frags[i] =
+			    skb_shinfo(skb)->frags[i - 1];
+			get_page(skb_shinfo(skb)->frags[i].page);
+			skb_shinfo(skb)->frags[i].page =
+			    skb_shinfo(skb)->frags[i - 1].page;
+			skb_shinfo(skb)->frags[i].page_offset +=
+			    skb_shinfo(skb)->frags[i - 1].size;
+			skb_shinfo(skb)->frags[i].size = rem;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+	}
+
+	/* Stamp the time, and sequence number, convert them to network byte order */
+
+	if (pgh) {
+		struct timeval timestamp;
+
+		pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+		pgh->seq_num = htonl(pkt_dev->seq_num);
+
+		do_gettimeofday(&timestamp);
+		pgh->tv_sec = htonl(timestamp.tv_sec);
+		pgh->tv_usec = htonl(timestamp.tv_usec);
+	}
+
+#ifdef CONFIG_XFRM
+	/* process_ipsec() frees the skb itself when it fails */
+	if (!process_ipsec(pkt_dev, skb, protocol))
+		return NULL;
+#endif
+
+	return skb;
+}
+
+/*
+ * scan_ip6, fmt_ip6 taken from dietlibc-0.21
+ * Author Felix von Leitner <felix-dietlibc@fefe.de>
+ *
+ * Slightly modified for kernel.
+ * Should be candidate for net/ipv4/utils.c
+ * --ro
+ */
+
+/*
+ * Parse the textual IPv6 address at @s into the 16 bytes of @ip.
+ *
+ * Part 1 reads 16-bit hex groups up to an optional "::"; part 2 reads
+ * the groups after "::" into a scratch buffer that is finally copied to
+ * the end of @ip, leaving the gap zero.  A trailing dotted IPv4 address
+ * is accepted for the last four bytes.
+ *
+ * Returns the number of characters accounted for, or 0 when a group in
+ * part 1 contains no hex digits.
+ */
+static unsigned int scan_ip6(const char *s, char ip[16])
+{
+	unsigned int i;
+	unsigned int len = 0;
+	unsigned long u;
+	char suffix[16];
+	unsigned int prefixlen = 0;
+	unsigned int suffixlen = 0;
+	__be32 tmp;
+	char *pos;
+
+	for (i = 0; i < 16; i++)
+		ip[i] = 0;
+
+	for (;;) {
+		if (*s == ':') {
+			len++;
+			if (s[1] == ':') {	/* Found "::", skip to part 2 */
+				s += 2;
+				len++;
+				break;
+			}
+			s++;
+		}
+
+		u = simple_strtoul(s, &pos, 16);
+		i = pos - s;
+		if (!i)
+			return 0;
+		if (prefixlen == 12 && s[i] == '.') {
+
+			/* the last 4 bytes may be written as IPv4 address */
+
+			tmp = in_aton(s);
+			memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp));
+			return i + len;
+		}
+		ip[prefixlen++] = (u >> 8);
+		ip[prefixlen++] = (u & 255);
+		s += i;
+		len += i;
+		if (prefixlen == 16)
+			return len;
+	}
+
+/* part 2, after "::" */
+	for (;;) {
+		if (*s == ':') {
+			if (suffixlen == 0)
+				break;
+			s++;
+			len++;
+		} else if (suffixlen != 0)
+			break;
+
+		/*
+		 * Unsigned parse, as in part 1: a group is plain hex digits
+		 * and must not be accepted with a leading sign.
+		 */
+		u = simple_strtoul(s, &pos, 16);
+		i = pos - s;
+		if (!i) {
+			if (*s)
+				len--;
+			break;
+		}
+		if (suffixlen + prefixlen <= 12 && s[i] == '.') {
+			tmp = in_aton(s);
+			memcpy((struct in_addr *)(suffix + suffixlen), &tmp,
+			       sizeof(tmp));
+			suffixlen += 4;
+			len += strlen(s);
+			break;
+		}
+		suffix[suffixlen++] = (u >> 8);
+		suffix[suffixlen++] = (u & 255);
+		s += i;
+		len += i;
+		if (prefixlen + suffixlen == 16)
+			break;
+	}
+	/* right-align the groups that followed "::" */
+	for (i = 0; i < suffixlen; i++)
+		ip[16 - suffixlen + i] = suffix[i];
+	return len;
+}
+
+/* Map a nibble value (0..15) to its lower-case hexadecimal character. */
+static char tohex(char hexdigit)
+{
+	if (hexdigit > 9)
+		return hexdigit - 10 + 'a';
+
+	return hexdigit + '0';
+}
+
+/*
+ * Write the low 16 bits of @i to @s as lower-case hex without leading
+ * zeroes (a value of zero yields the single digit "0").  No terminating
+ * NUL is written.  Returns the number of characters stored.
+ */
+static int fmt_xlong(char *s, unsigned int i)
+{
+	char *out = s;
+	int shift;
+
+	/*
+	 * Upper three nibbles: the write position only advances once a
+	 * non-zero digit has been stored, so leading zeroes get overwritten.
+	 */
+	for (shift = 12; shift > 0; shift -= 4) {
+		*out = tohex((i >> shift) & 0xf);
+		if (out != s || *out != '0')
+			++out;
+	}
+
+	/* the lowest nibble is always emitted */
+	*out = tohex(i & 0xf);
+
+	return out - s + 1;
+}
+
+/*
+ * Format the 16-byte IPv6 address @ip as text into @s (NUL terminated).
+ *
+ * The longest run of all-zero 16-bit groups (the first one on a tie) is
+ * written as "::"; every other group is printed in hex without leading
+ * zeroes.  Only one run is compressed, because an address containing
+ * "::" twice cannot be parsed back unambiguously.
+ *
+ * Returns the number of characters written, not counting the NUL.
+ */
+static unsigned int fmt_ip6(char *s, const char ip[16])
+{
+	unsigned int len = 0;
+	unsigned int i;
+	unsigned int temp;
+	int zrun_start = -1;	/* byte offset of the run to compress */
+	int zrun_groups = 0;	/* its length in 16-bit groups */
+	int cur_start = -1;
+	int cur_groups = 0;
+	int j;
+
+	/* pass 1: find the longest run of zero groups */
+	for (j = 0; j < 16; j += 2) {
+		if (ip[j] || ip[j + 1]) {
+			cur_start = -1;
+			continue;
+		}
+		if (cur_start < 0) {
+			cur_start = j;
+			cur_groups = 0;
+		}
+		if (++cur_groups > zrun_groups) {
+			zrun_start = cur_start;
+			zrun_groups = cur_groups;
+		}
+	}
+
+	/* pass 2: emit the groups */
+	for (j = 0; j < 16; j += 2) {
+
+#ifdef V4MAPPEDPREFIX
+		if (j == 12 && !memcmp(ip, V4mappedprefix, 12)) {
+			inet_ntoa_r(*(struct in_addr *)(ip + 12), s);
+			temp = strlen(s);
+			return len + temp;
+		}
+#endif
+		if (j == zrun_start) {
+			/*
+			 * The first ':' of "::" is the separator already
+			 * written after the previous group, except at the
+			 * very start where it has to be added here.
+			 */
+			if (j == 0) {
+				*s++ = ':';
+				++len;
+			}
+			*s++ = ':';
+			++len;
+			/* skip the rest of the run */
+			j += 2 * (zrun_groups - 1);
+			continue;
+		}
+
+		temp = ((unsigned long)(unsigned char)ip[j] << 8) +
+		    (unsigned long)(unsigned char)ip[j + 1];
+		i = fmt_xlong(s, temp);
+		len += i;
+		s += i;
+		if (j < 14) {
+			*s++ = ':';
+			++len;
+		}
+	}
+	*s = 0;
+	return len;
+}
+
+/*
+ * Build one IPv6/UDP packet for @pkt_dev, to be transmitted on @odev.
+ *
+ * Layout: ethernet header, optional MPLS labels, optional (S)VLAN tags,
+ * IPv6 header, UDP header, then a payload that starts with a struct
+ * pktgen_hdr.  The payload lives in the linear area, or in page
+ * fragments when pkt_dev->nfrags is positive.
+ *
+ * Returns the new skb, or NULL when memory could not be allocated.
+ */
+static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
+					struct pktgen_dev *pkt_dev)
+{
+	struct sk_buff *skb = NULL;
+	__u8 *eth;
+	struct udphdr *udph;
+	int datalen;
+	struct ipv6hdr *iph;
+	struct pktgen_hdr *pgh = NULL;
+	__be16 protocol = htons(ETH_P_IPV6);
+	__be32 *mpls;
+	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */
+	__be16 *vlan_encapsulated_proto = NULL;  /* packet type ID field (or len) for VLAN tag */
+	__be16 *svlan_tci = NULL;                /* Encapsulates priority and SVLAN ID */
+	__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+	u16 queue_map;
+
+	if (pkt_dev->nr_labels)
+		protocol = htons(ETH_P_MPLS_UC);
+
+	if (pkt_dev->vlan_id != 0xffff)
+		protocol = htons(ETH_P_8021Q);
+
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	queue_map = pkt_dev->cur_queue_map;
+	mod_cur_headers(pkt_dev);
+
+	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16 +
+			pkt_dev->pkt_overhead, GFP_ATOMIC);
+	if (!skb) {
+		sprintf(pkt_dev->result, "No memory");
+		return NULL;
+	}
+
+	skb_reserve(skb, 16);
+
+	/*  Reserve for ethernet and IP header  */
+	eth = (__u8 *) skb_push(skb, 14);
+	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+	if (pkt_dev->nr_labels)
+		mpls_push(mpls, pkt_dev);
+
+	if (pkt_dev->vlan_id != 0xffff) {
+		if (pkt_dev->svlan_id != 0xffff) {
+			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_tci = build_tci(pkt_dev->svlan_id,
+					       pkt_dev->svlan_cfi,
+					       pkt_dev->svlan_p);
+			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_encapsulated_proto = htons(ETH_P_8021Q);
+		}
+		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_tci = build_tci(pkt_dev->vlan_id,
+				      pkt_dev->vlan_cfi,
+				      pkt_dev->vlan_p);
+		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_encapsulated_proto = htons(ETH_P_IPV6);
+	}
+
+	skb->network_header = skb->tail;
+	skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
+	skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr));
+	skb_set_queue_mapping(skb, queue_map);
+	iph = ipv6_hdr(skb);
+	udph = udp_hdr(skb);
+
+	memcpy(eth, pkt_dev->hh, 12);
+	*(__be16 *) & eth[12] = protocol;
+
+	/* Eth + IPh + UDPh + mpls */
+	datalen = pkt_dev->cur_pkt_size - 14 -
+		  sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
+		  pkt_dev->pkt_overhead;
+
+	/*
+	 * The sizeof comparison is unsigned, so a negative datalen has to
+	 * be caught explicitly; otherwise it would reach skb_put() as a
+	 * huge length.
+	 */
+	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
+		datalen = sizeof(struct pktgen_hdr);
+		if (net_ratelimit())
+			printk(KERN_INFO "pktgen: increased datalen to %d\n",
+			       datalen);
+	}
+
+	udph->source = htons(pkt_dev->cur_udp_src);
+	udph->dest = htons(pkt_dev->cur_udp_dst);
+	udph->len = htons(datalen + sizeof(struct udphdr));
+	udph->check = 0;	/* No checksum */
+
+	*(__be32 *) iph = htonl(0x60000000);	/* Version + flow */
+
+	if (pkt_dev->traffic_class) {
+		/* Version + traffic class + flow (0) */
+		*(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20));
+	}
+
+	iph->hop_limit = 32;
+
+	iph->payload_len = htons(sizeof(struct udphdr) + datalen);
+	iph->nexthdr = IPPROTO_UDP;
+
+	ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
+	ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+
+	skb->mac_header = (skb->network_header - ETH_HLEN -
+			   pkt_dev->pkt_overhead);
+	skb->protocol = protocol;
+	skb->dev = odev;
+	skb->pkt_type = PACKET_HOST;
+
+	if (pkt_dev->nfrags <= 0)
+		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
+	else {
+		int frags = pkt_dev->nfrags;
+		int i;
+
+		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
+
+		if (frags > MAX_SKB_FRAGS)
+			frags = MAX_SKB_FRAGS;
+		/* whatever does not fit into the pages stays linear */
+		if (datalen > frags * PAGE_SIZE) {
+			skb_put(skb, datalen - frags * PAGE_SIZE);
+			datalen = frags * PAGE_SIZE;
+		}
+
+		i = 0;
+		while (datalen > 0) {
+			struct page *page = alloc_pages(GFP_KERNEL, 0);
+
+			if (!page) {
+				/* fragments attached so far go with the skb */
+				kfree_skb(skb);
+				sprintf(pkt_dev->result, "No memory");
+				return NULL;
+			}
+			skb_shinfo(skb)->frags[i].page = page;
+			skb_shinfo(skb)->frags[i].page_offset = 0;
+			skb_shinfo(skb)->frags[i].size =
+			    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+			datalen -= skb_shinfo(skb)->frags[i].size;
+			skb->len += skb_shinfo(skb)->frags[i].size;
+			skb->data_len += skb_shinfo(skb)->frags[i].size;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+
+		/*
+		 * Fewer pages than requested fragments: keep splitting the
+		 * last fragment in half, both halves sharing one page.
+		 */
+		while (i < frags) {
+			int rem;
+
+			if (i == 0)
+				break;
+
+			rem = skb_shinfo(skb)->frags[i - 1].size / 2;
+			if (rem == 0)
+				break;
+
+			skb_shinfo(skb)->frags[i - 1].size -= rem;
+
+			skb_shinfo(skb)->frags[i] =
+			    skb_shinfo(skb)->frags[i - 1];
+			get_page(skb_shinfo(skb)->frags[i].page);
+			skb_shinfo(skb)->frags[i].page =
+			    skb_shinfo(skb)->frags[i - 1].page;
+			skb_shinfo(skb)->frags[i].page_offset +=
+			    skb_shinfo(skb)->frags[i - 1].size;
+			skb_shinfo(skb)->frags[i].size = rem;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+	}
+
+	/* Stamp the time, and sequence number, convert them to network byte order */
+	/* should we update cloned packets too ? */
+	if (pgh) {
+		struct timeval timestamp;
+
+		pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+		pgh->seq_num = htonl(pkt_dev->seq_num);
+
+		do_gettimeofday(&timestamp);
+		pgh->tv_sec = htonl(timestamp.tv_sec);
+		pgh->tv_usec = htonl(timestamp.tv_usec);
+	}
+	/* pkt_dev->seq_num++; FF: you really mean this? */
+
+	return skb;
+}
+
+/* Build the next packet with the IPv6 builder if F_IPV6 is set, else IPv4. */
+static inline struct sk_buff *fill_packet(struct net_device *odev,
+					  struct pktgen_dev *pkt_dev)
+{
+	if (!(pkt_dev->flags & F_IPV6))
+		return fill_packet_ipv4(odev, pkt_dev);
+
+	return fill_packet_ipv6(odev, pkt_dev);
+}
+
+/* Reset the per-run statistics of @pkt_dev; sequence numbers restart at 1. */
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
+{
+	/* traffic counters */
+	pkt_dev->sofar = 0;
+	pkt_dev->tx_bytes = 0;
+	pkt_dev->errors = 0;
+
+	/* accumulated idle time and sequence numbering */
+	pkt_dev->idle_acc = 0;
+	pkt_dev->seq_num = 1;
+}
+
+/* Set up structure for sending pkts, clear counters */
+
+/*
+ * Start every device on thread @t that has an output device after
+ * pktgen_setup_inject(): counters are cleared, the device is marked
+ * running and scheduled to transmit immediately.  T_STOP is cleared on
+ * the thread if at least one device was started.
+ */
+static void pktgen_run(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev;
+	int nr_started = 0;
+
+	pr_debug("pktgen: entering pktgen_run. %p\n", t);
+
+	if_lock(t);
+	list_for_each_entry(dev, &t->if_list, list) {
+
+		/* setup odev and create initial packet */
+		pktgen_setup_inject(dev);
+
+		if (!dev->odev) {
+			strcpy(dev->result, "Error starting");
+			continue;
+		}
+
+		pktgen_clear_counters(dev);
+		dev->running = 1;	/* Cranke yeself! */
+		dev->skb = NULL;
+		dev->started_at = getCurUs();
+		dev->next_tx_us = getCurUs();	/* Transmit immediately */
+		dev->next_tx_ns = 0;
+		set_pkt_overhead(dev);
+
+		strcpy(dev->result, "Starting");
+		nr_started++;
+	}
+	if_unlock(t);
+
+	if (nr_started)
+		t->control &= ~(T_STOP);
+}
+
+/* Request a stop on every pktgen thread by setting T_STOP in its control word. */
+static void pktgen_stop_all_threads_ifs(void)
+{
+	struct pktgen_thread *thread;
+
+	pr_debug("pktgen: entering pktgen_stop_all_threads_ifs.\n");
+
+	mutex_lock(&pktgen_thread_lock);
+	list_for_each_entry(thread, &pktgen_threads, th_list) {
+		thread->control |= T_STOP;
+	}
+	mutex_unlock(&pktgen_thread_lock);
+}
+
+/* Return 1 if any device on @t's interface list is marked running, else 0. */
+static int thread_is_running(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev;
+
+	list_for_each_entry(dev, &t->if_list, list) {
+		if (dev->running)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Poll every 100 ms until no device on @t is running.
+ *
+ * Returns 1 once the thread is idle, or 0 if a signal became pending
+ * while waiting.  The interface lock is only held while checking.
+ */
+static int pktgen_wait_thread_run(struct pktgen_thread *t)
+{
+	for (;;) {
+		int busy;
+
+		if_lock(t);
+		busy = thread_is_running(t);
+		if_unlock(t);
+
+		if (!busy)
+			return 1;
+
+		msleep_interruptible(100);
+
+		if (signal_pending(current))
+			return 0;
+	}
+}
+
+/*
+ * Wait for each pktgen thread in turn to finish its run.  If a wait is
+ * cut short by a signal, T_STOP is set on all threads.
+ *
+ * Returns the result of the last wait: 1 if all completed, 0 on signal.
+ */
+static int pktgen_wait_all_threads_run(void)
+{
+	struct pktgen_thread *thread;
+	int completed = 1;
+
+	mutex_lock(&pktgen_thread_lock);
+
+	list_for_each_entry(thread, &pktgen_threads, th_list) {
+		completed = pktgen_wait_thread_run(thread);
+		if (!completed)
+			break;
+	}
+
+	if (!completed) {
+		list_for_each_entry(thread, &pktgen_threads, th_list) {
+			thread->control |= (T_STOP);
+		}
+	}
+
+	mutex_unlock(&pktgen_thread_lock);
+
+	return completed;
+}
+
+/*
+ * Set T_RUN on every pktgen thread, give the threads 125 ms to notice
+ * the request, then wait until all of them have finished.
+ */
+static void pktgen_run_all_threads(void)
+{
+	struct pktgen_thread *thread;
+
+	pr_debug("pktgen: entering pktgen_run_all_threads.\n");
+
+	mutex_lock(&pktgen_thread_lock);
+	list_for_each_entry(thread, &pktgen_threads, th_list) {
+		thread->control |= (T_RUN);
+	}
+	mutex_unlock(&pktgen_thread_lock);
+
+	/* Propagate thread->control */
+	schedule_timeout_interruptible(msecs_to_jiffies(125));
+
+	pktgen_wait_all_threads_run();
+}
+
+/*
+ * Format the run summary of @pkt_dev into pkt_dev->result: elapsed and
+ * idle time in microseconds, packets sent, packet size, @nr_frags, and
+ * the derived packets/s and bit/s rates.
+ */
+static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
+{
+	__u64 total_us, bps, mbps, pps, idle;
+	char *p = pkt_dev->result;
+
+	total_us = pkt_dev->stopped_at - pkt_dev->started_at;
+
+	idle = pkt_dev->idle_acc;
+
+	p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
+		     (unsigned long long)total_us,
+		     (unsigned long long)(total_us - idle),
+		     (unsigned long long)idle,
+		     (unsigned long long)pkt_dev->sofar,
+		     pkt_dev->cur_pkt_size, nr_frags);
+
+	pps = pkt_dev->sofar * USEC_PER_SEC;
+
+	/* do_div() takes a 32-bit divisor: scale both operands down to fit */
+	while ((total_us >> 32) != 0) {
+		pps >>= 1;
+		total_us >>= 1;
+	}
+
+	/* start and stop can fall into the same microsecond */
+	if (total_us)
+		do_div(pps, total_us);
+	else
+		pps = 0;
+
+	bps = pps * 8 * pkt_dev->cur_pkt_size;
+
+	mbps = bps;
+	do_div(mbps, 1000000);
+	p += sprintf(p, "  %llupps %lluMb/sec (%llubps) errors: %llu",
+		     (unsigned long long)pps,
+		     (unsigned long long)mbps,
+		     (unsigned long long)bps,
+		     (unsigned long long)pkt_dev->errors);
+}
+
+/* Set stopped-at timer, remove from running list, do counters & statistics */
+
+/*
+ * Mark @pkt_dev stopped, record the stop time and write the result
+ * summary.  Returns 0, or -EINVAL if the device was not running.
+ */
+static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
+{
+	/* -1 is reported when no packet is cached */
+	int nr_frags = -1;
+
+	if (pkt_dev->skb)
+		nr_frags = skb_shinfo(pkt_dev->skb)->nr_frags;
+
+	if (!pkt_dev->running) {
+		printk(KERN_WARNING "pktgen: interface: %s is already "
+		       "stopped\n", pkt_dev->odev->name);
+		return -EINVAL;
+	}
+
+	pkt_dev->running = 0;
+	pkt_dev->stopped_at = getCurUs();
+
+	show_results(pkt_dev, nr_frags);
+
+	return 0;
+}
+
+/*
+ * Pick the running device on @t with the smallest next_tx_us (the first
+ * one found wins a tie).  Returns NULL when no device is running.
+ */
+static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev;
+	struct pktgen_dev *earliest = NULL;
+
+	if_lock(t);
+	list_for_each_entry(dev, &t->if_list, list) {
+		if (!dev->running)
+			continue;
+		if (!earliest || dev->next_tx_us < earliest->next_tx_us)
+			earliest = dev;
+	}
+	if_unlock(t);
+
+	return earliest;
+}
+
+/* Stop every device on thread @t and release its cached packet. */
+static void pktgen_stop(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev;
+
+	pr_debug("pktgen: entering pktgen_stop\n");
+
+	if_lock(t);
+	list_for_each_entry(dev, &t->if_list, list) {
+		struct sk_buff *cached;
+
+		pktgen_stop_device(dev);
+
+		cached = dev->skb;
+		if (cached)
+			kfree_skb(cached);
+		dev->skb = NULL;
+	}
+	if_unlock(t);
+}
+
+/*
+ * one of our devices needs to be removed - find it
+ * and remove it
+ */
+/*
+ * Remove the first device on @t that carries a removal mark, dropping
+ * its cached packet first.  At most one device is removed per call.
+ */
+static void pktgen_rem_one_if(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev, *next;
+
+	pr_debug("pktgen: entering pktgen_rem_one_if\n");
+
+	if_lock(t);
+
+	list_for_each_entry_safe(dev, next, &t->if_list, list) {
+		if (dev->removal_mark) {
+			if (dev->skb)
+				kfree_skb(dev->skb);
+			dev->skb = NULL;
+
+			pktgen_remove_device(t, dev);
+			break;
+		}
+	}
+
+	if_unlock(t);
+}
+
+/* Remove every device from thread @t, freeing each cached packet first. */
+static void pktgen_rem_all_ifs(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev, *next;
+
+	pr_debug("pktgen: entering pktgen_rem_all_ifs\n");
+
+	if_lock(t);
+
+	/* the _safe iterator is needed: removal unlinks and frees @dev */
+	list_for_each_entry_safe(dev, next, &t->if_list, list) {
+		if (dev->skb)
+			kfree_skb(dev->skb);
+		dev->skb = NULL;
+
+		pktgen_remove_device(t, dev);
+	}
+
+	if_unlock(t);
+}
+
+/* Remove thread @t's procfs entry and unlink it from the global thread list. */
+static void pktgen_rem_thread(struct pktgen_thread *t)
+{
+	/* the proc entry is named after the task */
+	remove_proc_entry(t->tsk->comm, pg_proc_dir);
+
+	/* Remove from the thread list */
+	mutex_lock(&pktgen_thread_lock);
+	list_del(&t->th_list);
+	mutex_unlock(&pktgen_thread_lock);
+}
+
+static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
+{
+ struct net_device *odev = NULL;
+ struct netdev_queue *txq;
+ __u64 idle_start = 0;
+ u16 queue_map;
+ int ret;
+ /*
+ * One transmit attempt for pkt_dev: wait for the scheduled time if a
+ * delay is configured, build or reuse the packet, pass it to the
+ * driver, schedule the next attempt and stop the device once
+ * pkt_dev->count packets have been sent.
+ */
+ odev = pkt_dev->odev;
+
+ if (pkt_dev->delay_us || pkt_dev->delay_ns) {
+ u64 now;
+
+ now = getCurUs();
+ if (now < pkt_dev->next_tx_us)
+ spin(pkt_dev, pkt_dev->next_tx_us);
+
+ /* This is max DELAY, this has special meaning of
+ * "never transmit"
+ */
+ if (pkt_dev->delay_us == 0x7FFFFFFF) {
+ pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us;
+ pkt_dev->next_tx_ns = pkt_dev->delay_ns;
+ goto out;
+ }
+ }
+ /*
+ * No cached packet: choose the queue for the packet about to be
+ * built. Otherwise use the queue recorded in the cached skb.
+ */
+ if (!pkt_dev->skb) {
+ set_cur_queue_map(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
+ } else {
+ queue_map = skb_get_queue_mapping(pkt_dev->skb);
+ }
+ /*
+ * Queue stopped or frozen, or a reschedule is pending: the time
+ * spent in this branch is added to idle_acc.
+ */
+ txq = netdev_get_tx_queue(odev, queue_map);
+ if (netif_tx_queue_stopped(txq) ||
+ netif_tx_queue_frozen(txq) ||
+ need_resched()) {
+ idle_start = getCurUs();
+
+ if (!netif_running(odev)) {
+ pktgen_stop_device(pkt_dev);
+ if (pkt_dev->skb)
+ kfree_skb(pkt_dev->skb);
+ pkt_dev->skb = NULL;
+ goto out;
+ }
+ if (need_resched())
+ schedule();
+
+ pkt_dev->idle_acc += getCurUs() - idle_start;
+
+ if (netif_tx_queue_stopped(txq) ||
+ netif_tx_queue_frozen(txq)) {
+ pkt_dev->next_tx_us = getCurUs(); /* TODO */
+ pkt_dev->next_tx_ns = 0;
+ goto out; /* Try the next interface */
+ }
+ }
+ /*
+ * A new packet is built when none is cached, or when the previous
+ * send succeeded and the reuse counter has reached clone_skb.
+ */
+ if (pkt_dev->last_ok || !pkt_dev->skb) {
+ if ((++pkt_dev->clone_count >= pkt_dev->clone_skb)
+ || (!pkt_dev->skb)) {
+ /* build a new pkt */
+ if (pkt_dev->skb)
+ kfree_skb(pkt_dev->skb);
+
+ pkt_dev->skb = fill_packet(odev, pkt_dev);
+ if (pkt_dev->skb == NULL) {
+ printk(KERN_ERR "pktgen: ERROR: couldn't "
+ "allocate skb in fill_packet.\n");
+ schedule();
+ pkt_dev->clone_count--; /* back out increment, OOM */
+ goto out;
+ }
+ pkt_dev->allocated_skbs++;
+ pkt_dev->clone_count = 0; /* reset counter */
+ }
+ }
+
+ /* fill_packet() might have changed the queue */
+ queue_map = skb_get_queue_mapping(pkt_dev->skb);
+ txq = netdev_get_tx_queue(odev, queue_map);
+
+ __netif_tx_lock_bh(txq);
+ if (!netif_tx_queue_stopped(txq) &&
+ !netif_tx_queue_frozen(txq)) {
+ /*
+ * Take an extra reference before handing the skb to the driver;
+ * it is dropped again below if the driver does not accept it.
+ */
+ atomic_inc(&(pkt_dev->skb->users));
+ retry_now:
+ ret = odev->hard_start_xmit(pkt_dev->skb, odev);
+ if (likely(ret == NETDEV_TX_OK)) {
+ pkt_dev->last_ok = 1;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
+
+ } else if (ret == NETDEV_TX_LOCKED
+ && (odev->features & NETIF_F_LLTX)) {
+ cpu_relax();
+ goto retry_now;
+ } else { /* Retry it next time */
+
+ atomic_dec(&(pkt_dev->skb->users));
+
+ if (debug && net_ratelimit())
+ printk(KERN_INFO "pktgen: Hard xmit error\n");
+
+ pkt_dev->errors++;
+ pkt_dev->last_ok = 0;
+ }
+ /* next attempt: now plus the configured delay */
+ pkt_dev->next_tx_us = getCurUs();
+ pkt_dev->next_tx_ns = 0;
+
+ pkt_dev->next_tx_us += pkt_dev->delay_us;
+ pkt_dev->next_tx_ns += pkt_dev->delay_ns;
+
+ if (pkt_dev->next_tx_ns > 1000) {
+ pkt_dev->next_tx_us++;
+ pkt_dev->next_tx_ns -= 1000;
+ }
+ }
+
+ else { /* Retry it next time */
+ pkt_dev->last_ok = 0;
+ pkt_dev->next_tx_us = getCurUs(); /* TODO */
+ pkt_dev->next_tx_ns = 0;
+ }
+
+ __netif_tx_unlock_bh(txq);
+
+ /* If pkt_dev->count is zero, then run forever.  Otherwise, once
+ * count packets have been sent, wait (counted as idle time) until
+ * the skb's user count is back to 1 or a signal is pending, then
+ * stop the device.
+ */
+ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
+ if (atomic_read(&(pkt_dev->skb->users)) != 1) {
+ idle_start = getCurUs();
+ while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+ if (signal_pending(current)) {
+ break;
+ }
+ schedule();
+ }
+ pkt_dev->idle_acc += getCurUs() - idle_start;
+ }
+
+ /* Done with this */
+ pktgen_stop_device(pkt_dev);
+ if (pkt_dev->skb)
+ kfree_skb(pkt_dev->skb);
+ pkt_dev->skb = NULL;
+ }
+out:;
+}
+
+/*
+ * Main loop of the thread goes here
+ */
+
+static int pktgen_thread_worker(void *arg)
+{
+ DEFINE_WAIT(wait);
+ struct pktgen_thread *t = arg;
+ struct pktgen_dev *pkt_dev = NULL;
+ int cpu = t->cpu;
+ /*
+ * Body of a pktgen kernel thread; arg is its struct pktgen_thread.
+ * Until kthread_should_stop() is true it transmits for the device
+ * chosen by next_to_run() and services the request bits in
+ * t->control.  Always returns 0.
+ */
+ BUG_ON(smp_processor_id() != cpu);
+
+ init_waitqueue_head(&t->queue);
+ complete(&t->start_done);
+
+ pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ pkt_dev = next_to_run(t);
+ /*
+ * No running device and no control request pending: sleep on
+ * t->queue with a timeout of HZ / 10 jiffies.
+ */
+ if (!pkt_dev &&
+ (t->control & (T_STOP | T_RUN | T_REMDEVALL | T_REMDEV))
+ == 0) {
+ prepare_to_wait(&(t->queue), &wait,
+ TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ / 10);
+ finish_wait(&(t->queue), &wait);
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (pkt_dev)
+ pktgen_xmit(pkt_dev);
+ /* service control requests; each bit is cleared once handled */
+ if (t->control & T_STOP) {
+ pktgen_stop(t);
+ t->control &= ~(T_STOP);
+ }
+
+ if (t->control & T_RUN) {
+ pktgen_run(t);
+ t->control &= ~(T_RUN);
+ }
+
+ if (t->control & T_REMDEVALL) {
+ pktgen_rem_all_ifs(t);
+ t->control &= ~(T_REMDEVALL);
+ }
+
+ if (t->control & T_REMDEV) {
+ pktgen_rem_one_if(t);
+ t->control &= ~(T_REMDEV);
+ }
+
+ try_to_freeze();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ /* shutdown: stop and remove all devices, then unlink the thread */
+ pr_debug("pktgen: %s stopping all device\n", t->tsk->comm);
+ pktgen_stop(t);
+
+ pr_debug("pktgen: %s removing all device\n", t->tsk->comm);
+ pktgen_rem_all_ifs(t);
+
+ pr_debug("pktgen: %s removing thread.\n", t->tsk->comm);
+ pktgen_rem_thread(t);
+
+ return 0;
+}
+
+/*
+ * Look up the device on thread @t whose output device is named @ifname
+ * (compared over at most IFNAMSIZ characters).  Returns NULL if there
+ * is none.
+ */
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+					  const char *ifname)
+{
+	struct pktgen_dev *cur;
+	struct pktgen_dev *found = NULL;
+
+	if_lock(t);
+	list_for_each_entry(cur, &t->if_list, list) {
+		if (strncmp(cur->odev->name, ifname, IFNAMSIZ) != 0)
+			continue;
+
+		found = cur;
+		break;
+	}
+	if_unlock(t);
+
+	pr_debug("pktgen: find_dev(%s) returning %p\n", ifname, found);
+	return found;
+}
+
+/*
+ * Adds a dev at front of if_list.
+ */
+
+/*
+ * Attach @pkt_dev to thread @t, at the head of its interface list, in
+ * the stopped state.  Returns 0, or -EBUSY if the device already
+ * belongs to a thread.
+ */
+static int add_dev_to_thread(struct pktgen_thread *t,
+			     struct pktgen_dev *pkt_dev)
+{
+	int rv;
+
+	if_lock(t);
+
+	if (!pkt_dev->pg_thread) {
+		list_add(&pkt_dev->list, &t->if_list);
+		pkt_dev->pg_thread = t;
+		pkt_dev->running = 0;
+		rv = 0;
+	} else {
+		printk(KERN_ERR "pktgen: ERROR: already assigned "
+		       "to a thread.\n");
+		rv = -EBUSY;
+	}
+
+	if_unlock(t);
+	return rv;
+}
+
+/* Called under thread lock */
+
+/*
+ * Create a pktgen device for interface @ifname and attach it to thread
+ * @t.  The device gets default settings, a zeroed flow table and a
+ * procfs entry named after the interface.
+ *
+ * Returns 0 on success, -EBUSY if the interface is already in use,
+ * -ENOMEM on allocation failure, -EINVAL if the procfs entry cannot be
+ * created, or the error from pktgen_setup_dev()/add_dev_to_thread().
+ * Everything acquired here is released again on failure.
+ */
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
+{
+	struct pktgen_dev *pkt_dev;
+	int err;
+
+	/* We don't allow a device to be on several threads */
+
+	pkt_dev = __pktgen_NN_threads(ifname, FIND);
+	if (pkt_dev) {
+		printk(KERN_ERR "pktgen: ERROR: interface already used.\n");
+		return -EBUSY;
+	}
+
+	pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
+	if (!pkt_dev)
+		return -ENOMEM;
+
+	pkt_dev->flows = vmalloc(MAX_CFLOWS * sizeof(struct flow_state));
+	if (pkt_dev->flows == NULL) {
+		kfree(pkt_dev);
+		return -ENOMEM;
+	}
+	memset(pkt_dev->flows, 0, MAX_CFLOWS * sizeof(struct flow_state));
+
+	pkt_dev->removal_mark = 0;
+	pkt_dev->min_pkt_size = ETH_ZLEN;
+	pkt_dev->max_pkt_size = ETH_ZLEN;
+	pkt_dev->nfrags = 0;
+	pkt_dev->clone_skb = pg_clone_skb_d;
+	pkt_dev->delay_us = pg_delay_d / 1000;
+	pkt_dev->delay_ns = pg_delay_d % 1000;
+	pkt_dev->count = pg_count_d;
+	pkt_dev->sofar = 0;
+	pkt_dev->udp_src_min = 9;	/* sink port */
+	pkt_dev->udp_src_max = 9;
+	pkt_dev->udp_dst_min = 9;
+	pkt_dev->udp_dst_max = 9;
+
+	/* 0xffff in the id fields means "no tag" */
+	pkt_dev->vlan_p = 0;
+	pkt_dev->vlan_cfi = 0;
+	pkt_dev->vlan_id = 0xffff;
+	pkt_dev->svlan_p = 0;
+	pkt_dev->svlan_cfi = 0;
+	pkt_dev->svlan_id = 0xffff;
+
+	err = pktgen_setup_dev(pkt_dev, ifname);
+	if (err)
+		goto out1;
+
+#ifdef CONFIG_XFRM
+	/* set before the proc entry exposes the device to user space */
+	pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
+	pkt_dev->ipsproto = IPPROTO_ESP;
+#endif
+
+	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir,
+					  &pktgen_if_fops, pkt_dev);
+	if (!pkt_dev->entry) {
+		printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n",
+		       PG_PROC_DIR, ifname);
+		err = -EINVAL;
+		goto out2;
+	}
+
+	err = add_dev_to_thread(t, pkt_dev);
+	if (err)
+		goto out3;
+
+	return 0;
+
+out3:
+	remove_proc_entry(pkt_dev->entry->name, pg_proc_dir);
+out2:
+	dev_put(pkt_dev->odev);
+out1:
+#ifdef CONFIG_XFRM
+	free_SAs(pkt_dev);
+#endif
+	if (pkt_dev->flows)
+		vfree(pkt_dev->flows);
+	kfree(pkt_dev);
+	return err;
+}
+
+/*
+ * Create the pktgen worker thread for @cpu: allocate its descriptor,
+ * link it into the global thread list, bind the kthread to the CPU,
+ * create its procfs entry and wait until the worker has started.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init pktgen_create_thread(int cpu)
+{
+	struct pktgen_thread *t;
+	struct proc_dir_entry *pe;
+	struct task_struct *p;
+	int err;
+
+	t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL);
+	if (!t) {
+		printk(KERN_ERR "pktgen: ERROR: out of memory, can't "
+		       "create new thread.\n");
+		return -ENOMEM;
+	}
+
+	t->cpu = cpu;
+	spin_lock_init(&t->if_lock);
+	INIT_LIST_HEAD(&t->if_list);
+	init_completion(&t->start_done);
+
+	list_add_tail(&t->th_list, &pktgen_threads);
+
+	p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu);
+	if (IS_ERR(p)) {
+		printk(KERN_ERR "pktgen: kernel_thread() failed "
+		       "for cpu %d\n", t->cpu);
+		err = PTR_ERR(p);
+		goto unlink;
+	}
+	kthread_bind(p, cpu);
+	t->tsk = p;
+
+	pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir,
+			      &pktgen_thread_fops, t);
+	if (!pe) {
+		printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n",
+		       PG_PROC_DIR, t->tsk->comm);
+		kthread_stop(p);
+		err = -EINVAL;
+		goto unlink;
+	}
+
+	/* the worker signals start_done once it is up */
+	wake_up_process(p);
+	wait_for_completion(&t->start_done);
+
+	return 0;
+
+unlink:
+	list_del(&t->th_list);
+	kfree(t);
+	return err;
+}
+
+/*
+ * Removes a device from the thread if_list.
+ */
+/* Unlink @pkt_dev from @t's interface list if it is found there. */
+static void _rem_dev_from_if_list(struct pktgen_thread *t,
+				  struct pktgen_dev *pkt_dev)
+{
+	struct pktgen_dev *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &t->if_list, list) {
+		if (cur != pkt_dev)
+			continue;
+		list_del(&cur->list);
+	}
+}
+
+/*
+ * Tear down @pkt_dev: stop it if still running, drop the reference on
+ * its output device, unlink it from thread @t, remove its procfs entry
+ * and free its memory.  Always returns 0.
+ */
+static int pktgen_remove_device(struct pktgen_thread *t,
+				struct pktgen_dev *pkt_dev)
+{
+	struct net_device *odev;
+
+	pr_debug("pktgen: remove_device pkt_dev=%p\n", pkt_dev);
+
+	if (pkt_dev->running) {
+		printk(KERN_WARNING "pktgen: WARNING: trying to remove a "
+		       "running interface, stopping it now.\n");
+		pktgen_stop_device(pkt_dev);
+	}
+
+	/* Dis-associate from the interface */
+	odev = pkt_dev->odev;
+	if (odev) {
+		dev_put(odev);
+		pkt_dev->odev = NULL;
+	}
+
+	/* And update the thread if_list */
+	_rem_dev_from_if_list(t, pkt_dev);
+
+	if (pkt_dev->entry)
+		remove_proc_entry(pkt_dev->entry->name, pg_proc_dir);
+
+#ifdef CONFIG_XFRM
+	free_SAs(pkt_dev);
+#endif
+	if (pkt_dev->flows)
+		vfree(pkt_dev->flows);
+	kfree(pkt_dev);
+
+	return 0;
+}
+
+/*
+ * Module init: create the pktgen procfs directory and control file,
+ * register for netdevice events and start one worker thread per online
+ * CPU.  Fails with -ENODEV when the directory or every thread could not
+ * be created, and with -EINVAL when the control file could not.
+ */
+static int __init pg_init(void)
+{
+	struct proc_dir_entry *pe;
+	int cpu;
+	int err;
+
+	printk(KERN_INFO "%s", version);
+
+	pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net);
+	if (!pg_proc_dir)
+		return -ENODEV;
+	pg_proc_dir->owner = THIS_MODULE;
+
+	pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops);
+	if (pe == NULL) {
+		printk(KERN_ERR "pktgen: ERROR: cannot create %s "
+		       "procfs entry.\n", PGCTRL);
+		err = -EINVAL;
+		goto remove_dir;
+	}
+
+	/* Register us to receive netdevice events */
+	register_netdevice_notifier(&pktgen_notifier_block);
+
+	for_each_online_cpu(cpu) {
+		int rc = pktgen_create_thread(cpu);
+
+		/* a single failed CPU is not fatal */
+		if (rc)
+			printk(KERN_WARNING "pktgen: WARNING: Cannot create "
+			       "thread for cpu %d (%d)\n", cpu, rc);
+	}
+
+	if (!list_empty(&pktgen_threads))
+		return 0;
+
+	printk(KERN_ERR "pktgen: ERROR: Initialization failed for "
+	       "all threads\n");
+	unregister_netdevice_notifier(&pktgen_notifier_block);
+	remove_proc_entry(PGCTRL, pg_proc_dir);
+	err = -ENODEV;
+
+remove_dir:
+	proc_net_remove(&init_net, PG_PROC_DIR);
+	return err;
+}
+
+/*
+ * Module exit: stop and free every worker thread, unregister the
+ * netdevice notifier and remove the pktgen procfs entries.
+ */
+static void __exit pg_cleanup(void)
+{
+	struct pktgen_thread *t;
+	struct list_head *q, *n;
+
+	/* Stop all interfaces & threads */
+
+	list_for_each_safe(q, n, &pktgen_threads) {
+		t = list_entry(q, struct pktgen_thread, th_list);
+		kthread_stop(t->tsk);
+		kfree(t);
+	}
+
+	/* Un-register us from receiving netdevice events */
+	unregister_netdevice_notifier(&pktgen_notifier_block);
+
+	/* Clean up proc file system */
+	remove_proc_entry(PGCTRL, pg_proc_dir);
+	proc_net_remove(&init_net, PG_PROC_DIR);
+}
+
+/* Module entry points */
+module_init(pg_init);
+module_exit(pg_cleanup);
+
+MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
+MODULE_DESCRIPTION("Packet Generator tool");
+MODULE_LICENSE("GPL");
+
+/* Defaults for newly added devices and the debug switch; permission 0
+ * means the parameters are not exported through sysfs.
+ */
+module_param(pg_count_d, int, 0);
+module_param(pg_delay_d, int, 0);
+module_param(pg_clone_skb_d, int, 0);
+module_param(debug, int, 0);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 0000000..7552495
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,132 @@
+/*
+ * NET Generic infrastructure for Network protocols.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * From code originally in include/net/tcp.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <net/request_sock.h>
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show that
+ * it is absolutely not enough even at 100 conn/sec. 256 cures most
+ * of the problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Note: don't forget somaxconn, which may limit the backlog too.
+ *
+ * reqsk_queue_alloc() uses this value as the upper bound for the number
+ * of SYN table entries it is asked to allocate.
+ */
+int sysctl_max_syn_backlog = 256;
+
+/*
+ * reqsk_queue_alloc - allocate and install the SYN table of a request queue.
+ * @queue: request queue that receives the new listen_sock
+ * @nr_table_entries: requested number of table slots
+ *
+ * The request is capped at sysctl_max_syn_backlog, raised to at least 8 and
+ * then rounded up to a power of two (of the value plus one). The zeroed
+ * listen_sock, with its trailing pointer table, comes from kzalloc() when it
+ * fits in a page and from __vmalloc() otherwise.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+int reqsk_queue_alloc(struct request_sock_queue *queue,
+		      unsigned int nr_table_entries)
+{
+	struct listen_sock *lopt;
+	size_t lopt_size;
+
+	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+	nr_table_entries = max_t(u32, nr_table_entries, 8);
+	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+
+	lopt_size = sizeof(struct listen_sock) +
+		    nr_table_entries * sizeof(struct request_sock *);
+
+	/* The same size test picks vfree()/kfree() in the destroy paths. */
+	lopt = (lopt_size > PAGE_SIZE) ?
+		__vmalloc(lopt_size,
+			  GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			  PAGE_KERNEL) :
+		kzalloc(lopt_size, GFP_KERNEL);
+	if (!lopt)
+		return -ENOMEM;
+
+	/* Smallest exponent >= 3 with 2^exponent >= nr_table_entries. */
+	lopt->max_qlen_log = 3;
+	while ((1 << lopt->max_qlen_log) < nr_table_entries)
+		lopt->max_qlen_log++;
+
+	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
+	rwlock_init(&queue->syn_wait_lock);
+	queue->rskq_accept_head = NULL;
+	lopt->nr_table_entries = nr_table_entries;
+
+	/* Publish the table under the writer side of syn_wait_lock. */
+	write_lock_bh(&queue->syn_wait_lock);
+	queue->listen_opt = lopt;
+	write_unlock_bh(&queue->syn_wait_lock);
+
+	return 0;
+}
+
+/*
+ * __reqsk_queue_destroy - free the listen_sock of @queue without emptying it.
+ *
+ * The allocation size is recomputed from nr_table_entries so that the
+ * release routine (vfree() or kfree()) matches the choice made in
+ * reqsk_queue_alloc().
+ */
+void __reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+	/*
+	 * this is an error recovery path only
+	 * no locking needed and the lopt is not NULL
+	 */
+	struct listen_sock *lopt = queue->listen_opt;
+	const size_t table_bytes =
+		lopt->nr_table_entries * sizeof(struct request_sock *);
+
+	if (sizeof(struct listen_sock) + table_bytes <= PAGE_SIZE)
+		kfree(lopt);
+	else
+		vfree(lopt);
+}
+
+/*
+ * reqsk_queue_yank_listen_sk - detach and return the listen_sock of @queue.
+ *
+ * queue->listen_opt is read and reset to NULL while syn_wait_lock is
+ * write-held; the previous pointer is handed to the caller.
+ */
+static inline struct listen_sock *reqsk_queue_yank_listen_sk(
+		struct request_sock_queue *queue)
+{
+	struct listen_sock *detached;
+
+	write_lock_bh(&queue->syn_wait_lock);
+	detached = queue->listen_opt;
+	queue->listen_opt = NULL;
+	write_unlock_bh(&queue->syn_wait_lock);
+
+	return detached;
+}
+
+/*
+ * reqsk_queue_destroy - empty and free the SYN table of @queue.
+ *
+ * The listen_sock is first detached from the queue, every request still
+ * chained in the table is released with reqsk_free(), and the table itself
+ * is then freed with the routine matching its size.
+ */
+void reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+	/* make all the listen_opt local to us */
+	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+	size_t lopt_size;
+
+	if (lopt->qlen != 0) {
+		unsigned int slot;
+
+		for (slot = 0; slot < lopt->nr_table_entries; slot++) {
+			struct request_sock *req = lopt->syn_table[slot];
+
+			/* Pop the chain head until the slot is empty. */
+			while (req != NULL) {
+				lopt->syn_table[slot] = req->dl_next;
+				lopt->qlen--;
+				reqsk_free(req);
+				req = lopt->syn_table[slot];
+			}
+		}
+	}
+
+	WARN_ON(lopt->qlen != 0);
+
+	lopt_size = sizeof(struct listen_sock) +
+		    lopt->nr_table_entries * sizeof(struct request_sock *);
+	if (lopt_size <= PAGE_SIZE)
+		kfree(lopt);
+	else
+		vfree(lopt);
+}
+
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
new file mode 100644
index 0000000..4dfb6b4
--- /dev/null
+++ b/net/core/rtnetlink.c
@@ -0,0 +1,1429 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Routing netlink socket interface: protocol independent part.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/if_addr.h>
+#include <linux/nsproxy.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/string.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/fib_rules.h>
+#include <net/rtnetlink.h>
+
+struct rtnl_link /* handler pair stored per rtnetlink message type */
+{
+ rtnl_doit_func doit; /* called for each request message */
+ rtnl_dumpit_func dumpit; /* called for each dump request (NLM_F_DUMP) */
+};
+
+static DEFINE_MUTEX(rtnl_mutex); /* the mutex wrapped by the rtnl_* helpers below */
+
+void rtnl_lock(void) /* take rtnl_mutex */
+{
+ mutex_lock(&rtnl_mutex);
+}
+
+void __rtnl_unlock(void) /* drop rtnl_mutex directly, without calling netdev_run_todo() */
+{
+ mutex_unlock(&rtnl_mutex);
+}
+
+void rtnl_unlock(void) /* drop rtnl_mutex by way of netdev_run_todo() */
+{
+ /* This fellow will unlock it for us: the mutex is released inside netdev_run_todo(). */
+ netdev_run_todo();
+}
+
+int rtnl_trylock(void) /* returns the result of mutex_trylock() on rtnl_mutex */
+{
+ return mutex_trylock(&rtnl_mutex);
+}
+
+int rtnl_is_locked(void) /* returns the result of mutex_is_locked() on rtnl_mutex */
+{
+ return mutex_is_locked(&rtnl_mutex);
+}
+
+/* Handler tables, one per protocol family; each is indexed by message index. */
+static struct rtnl_link *rtnl_msg_handlers[NPROTO];
+
+/*
+ * rtm_msgindex - convert an rtnetlink message type into a handler-table index.
+ *
+ * A negative index implies someone tried to register a netlink control
+ * code; an index >= RTM_NR_MSGTYPES may indicate that the message type has
+ * not been added to linux/rtnetlink.h. Both cases trigger BUG_ON().
+ */
+static inline int rtm_msgindex(int msgtype)
+{
+	const int idx = msgtype - RTM_BASE;
+
+	BUG_ON(idx < 0 || idx >= RTM_NR_MSGTYPES);
+	return idx;
+}
+
+/*
+ * rtnl_get_doit - look up the doit handler for @protocol and @msgindex.
+ *
+ * The family-specific table is preferred; when it is missing or has no doit
+ * entry for this message, the PF_UNSPEC table is consulted instead.
+ * Returns NULL when neither provides a handler.
+ */
+static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
+{
+	struct rtnl_link *handlers = rtnl_msg_handlers[protocol];
+
+	if (handlers != NULL && handlers[msgindex].doit != NULL)
+		return handlers[msgindex].doit;
+
+	handlers = rtnl_msg_handlers[PF_UNSPEC];
+	if (handlers == NULL)
+		return NULL;
+
+	return handlers[msgindex].doit;
+}
+
+/*
+ * rtnl_get_dumpit - look up the dumpit handler for @protocol and @msgindex.
+ *
+ * The family-specific table is preferred; when it is missing or has no
+ * dumpit entry for this message, the PF_UNSPEC table is consulted instead.
+ * Returns NULL when neither provides a handler.
+ */
+static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
+{
+	struct rtnl_link *handlers = rtnl_msg_handlers[protocol];
+
+	if (handlers != NULL && handlers[msgindex].dumpit != NULL)
+		return handlers[msgindex].dumpit;
+
+	handlers = rtnl_msg_handlers[PF_UNSPEC];
+	if (handlers == NULL)
+		return NULL;
+
+	return handlers[msgindex].dumpit;
+}
+
+/**
+ * __rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ *
+ * Stores the given function pointers (at least one of them has to be
+ * non-NULL) so that they are called whenever a request message for the
+ * given protocol family and message type is received. A NULL pointer
+ * leaves the corresponding slot as it was.
+ *
+ * Handlers registered for the special family PF_UNSPEC act as fallbacks
+ * for families that have no entry of their own.
+ *
+ * The per-family table is allocated on the first registration for that
+ * family.
+ *
+ * Returns 0 on success or -ENOBUFS if that allocation fails.
+ */
+int __rtnl_register(int protocol, int msgtype,
+		    rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+{
+	struct rtnl_link *handlers;
+	int idx;
+
+	BUG_ON(protocol < 0 || protocol >= NPROTO);
+	idx = rtm_msgindex(msgtype);
+
+	handlers = rtnl_msg_handlers[protocol];
+	if (!handlers) {
+		handlers = kcalloc(RTM_NR_MSGTYPES, sizeof(*handlers),
+				   GFP_KERNEL);
+		if (!handlers)
+			return -ENOBUFS;
+
+		rtnl_msg_handlers[protocol] = handlers;
+	}
+
+	if (doit)
+		handlers[idx].doit = doit;
+	if (dumpit)
+		handlers[idx].dumpit = dumpit;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(__rtnl_register);
+
+/**
+ * rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ *
+ * Identical to __rtnl_register() but panics on failure. This is useful
+ * as failure of this function is very unlikely, it can only happen due
+ * to lack of memory when allocating the chain to store all message
+ * handlers for a protocol. Meant for use in init functions where lack
+ * of memory implies no sense in continuing.
+ */
+void rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+{
+ if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0)
+ panic("Unable to register rtnetlink message handler, "
+ "protocol = %d, message type = %d\n",
+ protocol, msgtype);
+}
+
+EXPORT_SYMBOL_GPL(rtnl_register);
+
+/**
+ * rtnl_unregister - Unregister a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ *
+ * Clears both the doit and the dumpit handler of @msgtype in the table of
+ * @protocol. The table itself stays allocated.
+ *
+ * Returns 0 on success or -ENOENT if @protocol has no handler table.
+ */
+int rtnl_unregister(int protocol, int msgtype)
+{
+	struct rtnl_link *handlers;
+	int idx;
+
+	BUG_ON(protocol < 0 || protocol >= NPROTO);
+	idx = rtm_msgindex(msgtype);
+
+	handlers = rtnl_msg_handlers[protocol];
+	if (!handlers)
+		return -ENOENT;
+
+	handlers[idx].doit = NULL;
+	handlers[idx].dumpit = NULL;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(rtnl_unregister);
+
+/**
+ * rtnl_unregister_all - Unregister all rtnetlink message types of a protocol
+ * @protocol: Protocol family or PF_UNSPEC
+ *
+ * Identical to calling rtnl_unregister() for all registered message types
+ * of a certain protocol family, except that the handler table of the
+ * family is freed as well and its slot reset to NULL.
+ */
+void rtnl_unregister_all(int protocol)
+{
+ BUG_ON(protocol < 0 || protocol >= NPROTO);
+
+ kfree(rtnl_msg_handlers[protocol]);
+ rtnl_msg_handlers[protocol] = NULL;
+}
+
+EXPORT_SYMBOL_GPL(rtnl_unregister_all);
+
+static LIST_HEAD(link_ops); /* every registered rtnl_link_ops, chained through ops->list */
+
+/**
+ * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that create devices during module initialization. It
+ * must be called before registering the devices.
+ *
+ * If @ops has no dellink callback, unregister_netdevice is installed as
+ * the default before @ops is appended to the link_ops list.
+ *
+ * Returns 0 on success or a negative error code (the implementation
+ * below always returns 0).
+ */
+int __rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ if (!ops->dellink)
+ ops->dellink = unregister_netdevice;
+
+ list_add_tail(&ops->list, &link_ops);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(__rtnl_link_register);
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Locked wrapper: takes the rtnl lock around __rtnl_link_register().
+ *
+ * Returns the value returned by __rtnl_link_register(): 0 on success or a
+ * negative error code.
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+	int rc;
+
+	rtnl_lock();
+	rc = __rtnl_link_register(ops);
+	rtnl_unlock();
+
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(rtnl_link_register);
+
+/*
+ * __rtnl_kill_links - delete every device in @net that is driven by @ops.
+ *
+ * After each ops->dellink() call the device list is walked again from its
+ * head; the function returns once a complete pass finds no matching device.
+ */
+static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+	struct net_device *dev;
+	int removed;
+
+	do {
+		removed = 0;
+		for_each_netdev(net, dev) {
+			if (dev->rtnl_link_ops != ops)
+				continue;
+
+			ops->dellink(dev);
+			removed = 1;
+			/* start over from the head after every deletion */
+			break;
+		}
+	} while (removed);
+}
+
+/*
+ * rtnl_kill_links - locked wrapper around __rtnl_kill_links().
+ * @net: namespace whose devices are scanned
+ * @ops: link operations whose devices are deleted
+ */
+void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+	rtnl_lock();
+
+	__rtnl_kill_links(net, ops);
+
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_kill_links);
+
+/**
+ * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * The devices using @ops are deleted in every network namespace via
+ * __rtnl_kill_links() before @ops is unlinked from the link_ops list.
+ */
+void __rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ struct net *net;
+
+ for_each_net(net) {
+ __rtnl_kill_links(net, ops);
+ }
+ list_del(&ops->list);
+}
+
+EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * Locked wrapper: takes the rtnl lock around __rtnl_link_unregister().
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ rtnl_lock();
+ __rtnl_link_unregister(ops);
+ rtnl_unlock();
+}
+
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
+
+/*
+ * rtnl_link_ops_get - find the registered link ops whose kind equals @kind.
+ *
+ * Walks the link_ops list and returns the first exact string match, or
+ * NULL when no registered ops has that kind.
+ */
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+	const struct rtnl_link_ops *cur;
+
+	list_for_each_entry(cur, &link_ops, list) {
+		if (strcmp(cur->kind, kind) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * rtnl_link_get_size - space reserved for the IFLA_LINKINFO part of a link
+ * message for @dev.
+ *
+ * Returns 0 for devices without rtnl_link_ops. Otherwise the result sums
+ * the nest header, the kind string, and - when the ops provide the
+ * corresponding callbacks - the nested data and the xstats.
+ */
+static size_t rtnl_link_get_size(const struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+	size_t total = 0;
+
+	if (ops == NULL)
+		return total;
+
+	/* IFLA_LINKINFO */
+	total += nlmsg_total_size(sizeof(struct nlattr));
+	/* IFLA_INFO_KIND */
+	total += nlmsg_total_size(strlen(ops->kind) + 1);
+
+	/* IFLA_INFO_DATA + nested data */
+	if (ops->get_size)
+		total += nlmsg_total_size(sizeof(struct nlattr)) +
+			 ops->get_size(dev);
+
+	/* IFLA_INFO_XSTATS */
+	if (ops->get_xstats_size)
+		total += ops->get_xstats_size(dev);
+
+	return total;
+}
+
+/*
+ * rtnl_link_fill - append the IFLA_LINKINFO nest describing @dev to @skb.
+ * @skb: message being built
+ * @dev: device whose rtnl_link_ops is used; the ops pointer is dereferenced
+ *       without a NULL check, so the caller must make sure it is set
+ *
+ * Inside the nest the function emits IFLA_INFO_KIND, then whatever
+ * ops->fill_xstats() adds (if provided), then an IFLA_INFO_DATA nest filled
+ * by ops->fill_info() (if provided).
+ *
+ * Returns 0 on success. On failure everything added here is cancelled and
+ * a negative error is returned: -EMSGSIZE when an attribute cannot be
+ * added, or the error reported by the ops callback.
+ */
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+	struct nlattr *linkinfo, *data;
+	int err = -EMSGSIZE;
+
+	linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+	if (linkinfo == NULL)
+		goto out;
+
+	if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+		goto err_cancel_link;
+	if (ops->fill_xstats) {
+		err = ops->fill_xstats(skb, dev);
+		if (err < 0)
+			goto err_cancel_link;
+	}
+	if (ops->fill_info) {
+		data = nla_nest_start(skb, IFLA_INFO_DATA);
+		if (data == NULL) {
+			/*
+			 * err may hold the non-negative result of
+			 * fill_xstats() here; without resetting it the
+			 * cancelled nest would be reported as success.
+			 */
+			err = -EMSGSIZE;
+			goto err_cancel_link;
+		}
+		err = ops->fill_info(skb, dev);
+		if (err < 0)
+			goto err_cancel_data;
+		nla_nest_end(skb, data);
+	}
+
+	nla_nest_end(skb, linkinfo);
+	return 0;
+
+err_cancel_data:
+	nla_nest_cancel(skb, data);
+err_cancel_link:
+	nla_nest_cancel(skb, linkinfo);
+out:
+	return err;
+}
+
+static const int rtm_min[RTM_NR_FAMILIES] = /* per message family: NLMSG_LENGTH() of its fixed header struct */
+{
+ [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+ [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
+ [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)),
+ [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
+ [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
+ [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
+ [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
+ [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)),
+ [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+ [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+};
+
+static const int rta_max[RTM_NR_FAMILIES] = /* per message family: its *_MAX attribute constant; unlisted families stay 0 */
+{
+ [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX,
+ [RTM_FAM(RTM_NEWADDR)] = IFA_MAX,
+ [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX,
+ [RTM_FAM(RTM_NEWRULE)] = FRA_MAX,
+ [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX,
+ [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
+ [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
+ [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
+};
+
+/*
+ * __rta_fill - append one rtattr to @skb.
+ * @skb: buffer that is extended with skb_put()
+ * @attrtype: value stored in rta_type
+ * @attrlen: number of payload bytes copied from @data
+ * @data: payload source
+ *
+ * The attribute occupies RTA_ALIGN(RTA_LENGTH(@attrlen)) bytes; the bytes
+ * between the end of the attribute and the aligned end are zeroed.
+ */
+void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
+{
+	const int len = RTA_LENGTH(attrlen);
+	const int padded = RTA_ALIGN(len);
+	struct rtattr *rta = (struct rtattr *)skb_put(skb, padded);
+
+	rta->rta_type = attrtype;
+	rta->rta_len = len;
+
+	memcpy(RTA_DATA(rta), data, attrlen);
+	memset(RTA_DATA(rta) + attrlen, 0, padded - len);
+}
+
+/*
+ * rtnetlink_send - broadcast @skb to @group and optionally echo it to @pid.
+ * @skb: message to send
+ * @net: namespace whose rtnl socket is used
+ * @pid: port id handed to netlink_broadcast() and, when echoing, the
+ *       unicast destination
+ * @group: multicast group; also recorded in NETLINK_CB(skb).dst_group
+ * @echo: non-zero to additionally unicast the message to @pid
+ *
+ * Returns 0 when @echo is zero, otherwise the result of netlink_unicast().
+ * The result of the broadcast is not reported.
+ */
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
+{
+	struct sock *rtnl = net->rtnl;
+
+	NETLINK_CB(skb).dst_group = group;
+
+	if (!echo) {
+		netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+		return 0;
+	}
+
+	/* extra reference: the skb is used again by the unicast below */
+	atomic_inc(&skb->users);
+	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+
+	return netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+}
+
+/*
+ * rtnl_unicast - send @skb to port @pid through the rtnl socket of @net.
+ *
+ * Returns the result of nlmsg_unicast().
+ */
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{
+	return nlmsg_unicast(net->rtnl, skb, pid);
+}
+
+/*
+ * rtnl_notify - hand @skb to nlmsg_notify() on the rtnl socket of @net.
+ * @nlh: optional request header; when present its nlmsg_report() value is
+ *       forwarded as the report argument, otherwise 0 is used
+ *
+ * Returns the result of nlmsg_notify().
+ */
+int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+		struct nlmsghdr *nlh, gfp_t flags)
+{
+	const int report = nlh ? nlmsg_report(nlh) : 0;
+
+	return nlmsg_notify(net->rtnl, skb, pid, group, report, flags);
+}
+
+/*
+ * rtnl_set_sk_err - forward @error for @group to netlink_set_err() on the
+ * rtnl socket of @net, with a pid argument of 0.
+ */
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+	netlink_set_err(net->rtnl, 0, group, error);
+}
+
+/*
+ * rtnetlink_put_metrics - emit the non-zero entries of @metrics as a nested
+ * RTA_METRICS attribute.
+ * @skb: message being built
+ * @metrics: array of RTAX_MAX values; entry i is emitted with type i + 1
+ *
+ * Returns -ENOBUFS if the nest cannot be started, -EMSGSIZE if an entry
+ * does not fit (the nest is cancelled), 0 if every entry is zero (the empty
+ * nest is cancelled), and otherwise the result of nla_nest_end().
+ */
+int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
+{
+	struct nlattr *nest;
+	int idx, emitted = 0;
+
+	nest = nla_nest_start(skb, RTA_METRICS);
+	if (!nest)
+		return -ENOBUFS;
+
+	for (idx = 0; idx < RTAX_MAX; idx++) {
+		if (!metrics[idx])
+			continue;
+
+		emitted++;
+		/* jumps to nla_put_failure when the attribute does not fit */
+		NLA_PUT_U32(skb, idx + 1, metrics[idx]);
+	}
+
+	if (emitted)
+		return nla_nest_end(skb, nest);
+
+	nla_nest_cancel(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * rtnl_put_cacheinfo - append an RTA_CACHEINFO attribute built from @dst.
+ * @skb: message being built
+ * @dst: entry supplying last-use time, use count and reference count
+ * @id, @ts, @tsage, @error: copied into the matching rta_cacheinfo fields
+ * @expires: expiry in jiffies; converted with jiffies_to_clock_t() when
+ *           non-zero, otherwise rta_expires stays 0
+ *
+ * Returns the result of nla_put().
+ */
+int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
+		       u32 ts, u32 tsage, long expires, u32 error)
+{
+	struct rta_cacheinfo ci = {
+		.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
+		.rta_used = dst->__use,
+		.rta_clntref = atomic_read(&(dst->__refcnt)),
+		.rta_error = error,
+		.rta_id = id,
+		.rta_ts = ts,
+		.rta_tsage = tsage,
+		.rta_expires = expires ? jiffies_to_clock_t(expires) : 0,
+	};
+
+	return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+}
+
+EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+
+/*
+ * set_operstate - apply a requested operational-state transition to @dev.
+ * @transition: requested state; only IF_OPER_UP and IF_OPER_DORMANT have an
+ *              effect
+ *
+ * IF_OPER_UP is accepted from DORMANT or UNKNOWN, and only while
+ * netif_dormant() is false. IF_OPER_DORMANT is accepted from UP or UNKNOWN.
+ * When the state really changes it is stored under dev_base_lock and
+ * netdev_state_change() is called.
+ */
+static void set_operstate(struct net_device *dev, unsigned char transition)
+{
+	unsigned char next = dev->operstate;
+
+	if (transition == IF_OPER_UP) {
+		if ((next == IF_OPER_DORMANT || next == IF_OPER_UNKNOWN) &&
+		    !netif_dormant(dev))
+			next = IF_OPER_UP;
+	} else if (transition == IF_OPER_DORMANT) {
+		if (next == IF_OPER_UP || next == IF_OPER_UNKNOWN)
+			next = IF_OPER_DORMANT;
+	}
+
+	if (dev->operstate == next)
+		return;
+
+	write_lock_bh(&dev_base_lock);
+	dev->operstate = next;
+	write_unlock_bh(&dev_base_lock);
+	netdev_state_change(dev);
+}
+
+/*
+ * copy_rtnl_link_stats - copy device statistics into the netlink layout.
+ * @a: destination, the structure carried in the IFLA_STATS attribute
+ * @b: source, the statistics of the device
+ *
+ * The copy is done field by field because the two structures are distinct
+ * types; every counter is assigned to the field of the same name.
+ */
+static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
+				 struct net_device_stats *b)
+{
+	a->rx_packets = b->rx_packets;
+	a->tx_packets = b->tx_packets;
+	a->rx_bytes = b->rx_bytes;
+	a->tx_bytes = b->tx_bytes;
+	a->rx_errors = b->rx_errors;
+	a->tx_errors = b->tx_errors;
+	a->rx_dropped = b->rx_dropped;
+	a->tx_dropped = b->tx_dropped;
+
+	a->multicast = b->multicast;
+	a->collisions = b->collisions;
+
+	/* detailed receive errors */
+	a->rx_length_errors = b->rx_length_errors;
+	a->rx_over_errors = b->rx_over_errors;
+	a->rx_crc_errors = b->rx_crc_errors;
+	a->rx_frame_errors = b->rx_frame_errors;
+	a->rx_fifo_errors = b->rx_fifo_errors;
+	a->rx_missed_errors = b->rx_missed_errors;
+
+	/* detailed transmit errors */
+	a->tx_aborted_errors = b->tx_aborted_errors;
+	a->tx_carrier_errors = b->tx_carrier_errors;
+	a->tx_fifo_errors = b->tx_fifo_errors;
+	a->tx_heartbeat_errors = b->tx_heartbeat_errors;
+	a->tx_window_errors = b->tx_window_errors;
+
+	a->rx_compressed = b->rx_compressed;
+	a->tx_compressed = b->tx_compressed;
+}
+
+static inline size_t if_nlmsg_size(const struct net_device *dev) /* size passed to nlmsg_new() for a message built by rtnl_fill_ifinfo() */
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ + nla_total_size(sizeof(struct rtnl_link_ifmap)) /* IFLA_MAP */
+ + nla_total_size(sizeof(struct rtnl_link_stats)) /* IFLA_STATS */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ + nla_total_size(4) /* IFLA_TXQLEN */
+ + nla_total_size(4) /* IFLA_WEIGHT */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(1) /* IFLA_LINKMODE */
+ + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
+}
+
+/*
+ * rtnl_fill_ifinfo - build one link message describing @dev in @skb.
+ * @skb: buffer the message is appended to
+ * @dev: device being described
+ * @type: netlink message type stored in the header
+ * @pid, @seq, @flags: passed through to nlmsg_put()
+ * @change: stored in ifi_change
+ *
+ * The ifinfomsg header is followed by the attributes emitted below; the
+ * optional ones (IFLA_LINK, IFLA_MASTER, IFLA_QDISC, IFLA_IFALIAS, the
+ * addresses and IFLA_LINKINFO) are added only when the device has the
+ * corresponding data.
+ *
+ * Returns the result of nlmsg_end() on success. Returns -EMSGSIZE when the
+ * header or any attribute cannot be added; a partly built message is
+ * cancelled first.
+ */
+static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+			    int type, u32 pid, u32 seq, u32 change,
+			    unsigned int flags)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	struct netdev_queue *txq;
+	struct nlattr *stats_attr;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->__ifi_pad = 0;
+	ifm->ifi_type = dev->type;
+	ifm->ifi_index = dev->ifindex;
+	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_change = change;
+
+	/* Every NLA_PUT* below jumps to nla_put_failure when out of room. */
+	NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
+	NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len);
+	NLA_PUT_U8(skb, IFLA_OPERSTATE,
+		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
+	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
+	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+
+	if (dev->ifindex != dev->iflink)
+		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+
+	if (dev->master)
+		NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex);
+
+	/* The qdisc name is taken from transmit queue 0. */
+	txq = netdev_get_tx_queue(dev, 0);
+	if (txq->qdisc_sleeping)
+		NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id);
+
+	if (dev->ifalias)
+		NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias);
+
+	{
+		struct rtnl_link_ifmap map = {
+			.mem_start = dev->mem_start,
+			.mem_end = dev->mem_end,
+			.base_addr = dev->base_addr,
+			.irq = dev->irq,
+			.dma = dev->dma,
+			.port = dev->if_port,
+		};
+
+		NLA_PUT(skb, IFLA_MAP, sizeof(map), &map);
+	}
+
+	if (dev->addr_len) {
+		NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+		NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+	}
+
+	/* Reserve the stats attribute, then fill it in place. */
+	stats_attr = nla_reserve(skb, IFLA_STATS,
+				 sizeof(struct rtnl_link_stats));
+	if (!stats_attr)
+		goto nla_put_failure;
+
+	copy_rtnl_link_stats(nla_data(stats_attr), dev->get_stats(dev));
+
+	if (dev->rtnl_link_ops && rtnl_link_fill(skb, dev) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * rtnl_dump_ifinfo - dump callback emitting one RTM_NEWLINK per device.
+ * @skb: buffer receiving the messages
+ * @cb: dump state; args[0] holds the position at which to resume
+ *
+ * Devices before position args[0] are skipped. The walk stops at the first
+ * device whose message cannot be added, and that position is saved in
+ * args[0] for the next call.
+ *
+ * Returns skb->len.
+ */
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	const int s_idx = cb->args[0];
+	struct net_device *dev;
+	int idx = 0;
+
+	for_each_netdev(net, dev) {
+		if (idx >= s_idx &&
+		    rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+				     NETLINK_CB(cb->skb).pid,
+				     cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0)
+			break;
+		idx++;
+	}
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+const struct nla_policy ifla_policy[IFLA_MAX+1] = { /* policy for the IFLA_* link attributes, passed to nlmsg_parse() by the link handlers */
+ [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+ [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
+ [IFLA_MTU] = { .type = NLA_U32 },
+ [IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_TXQLEN] = { .type = NLA_U32 },
+ [IFLA_WEIGHT] = { .type = NLA_U32 },
+ [IFLA_OPERSTATE] = { .type = NLA_U8 },
+ [IFLA_LINKMODE] = { .type = NLA_U8 },
+ [IFLA_LINKINFO] = { .type = NLA_NESTED },
+ [IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
+};
+
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { /* policy for the attributes nested inside IFLA_LINKINFO */
+ [IFLA_INFO_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_DATA] = { .type = NLA_NESTED },
+};
+
+/*
+ * get_net_ns_by_pid - find the network namespace of the task with @pid.
+ *
+ * The task and its nsproxy are looked up inside an RCU read-side section.
+ *
+ * Returns the value of get_net() on the namespace of the task, or
+ * ERR_PTR(-ESRCH) when there is no such task or it has no nsproxy.
+ */
+static struct net *get_net_ns_by_pid(pid_t pid)
+{
+	struct net *found = ERR_PTR(-ESRCH);
+	struct task_struct *task;
+
+	/* Lookup the network namespace */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (task != NULL) {
+		struct nsproxy *ns = task_nsproxy(task);
+
+		if (ns != NULL)
+			found = get_net(ns->net_ns);
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * validate_linkmsg - check address attributes against an existing device.
+ * @dev: device the request refers to, or NULL (nothing is checked then)
+ * @tb: parsed IFLA_* attributes
+ *
+ * Returns -EINVAL if IFLA_ADDRESS or IFLA_BROADCAST is present but shorter
+ * than dev->addr_len, and 0 otherwise.
+ */
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+{
+	if (!dev)
+		return 0;
+
+	if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+		return -EINVAL;
+
+	if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * do_setlink - apply the attributes of a link request to @dev.
+ * @dev: device to modify
+ * @ifm: request header (index, flags and change mask)
+ * @tb: parsed IFLA_* attributes
+ * @ifname: name from the request, or an empty string
+ * @modified: non-zero if the caller has already changed the device
+ *
+ * The attributes are applied one after another in the order coded below
+ * and the first failure aborts the sequence without undoing earlier steps.
+ * If that happens after something was changed, a rate-limited warning is
+ * printed. NETDEV_CHANGEADDR notifiers run whenever the hardware or
+ * broadcast address was set, including on a later failure.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+		      struct nlattr **tb, char *ifname, int modified)
+{
+	int notify_addr = 0;
+	int err;
+
+	/* Move the device into the namespace of the given process. */
+	if (tb[IFLA_NET_NS_PID]) {
+		struct net *target;
+
+		target = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+		if (IS_ERR(target)) {
+			err = PTR_ERR(target);
+			goto errout;
+		}
+		err = dev_change_net_namespace(dev, target, ifname);
+		put_net(target);
+		if (err)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_MAP]) {
+		struct rtnl_link_ifmap *req;
+		struct ifmap cfg;
+
+		err = -EOPNOTSUPP;
+		if (!dev->set_config)
+			goto errout;
+
+		err = -ENODEV;
+		if (!netif_device_present(dev))
+			goto errout;
+
+		/* Narrow the netlink layout to the struct ifmap field types. */
+		req = nla_data(tb[IFLA_MAP]);
+		cfg.mem_start = (unsigned long) req->mem_start;
+		cfg.mem_end = (unsigned long) req->mem_end;
+		cfg.base_addr = (unsigned short) req->base_addr;
+		cfg.irq = (unsigned char) req->irq;
+		cfg.dma = (unsigned char) req->dma;
+		cfg.port = (unsigned char) req->port;
+
+		err = dev->set_config(dev, &cfg);
+		if (err < 0)
+			goto errout;
+
+		modified = 1;
+	}
+
+	if (tb[IFLA_ADDRESS]) {
+		struct sockaddr *addr;
+		int addr_size;
+
+		err = -EOPNOTSUPP;
+		if (!dev->set_mac_address)
+			goto errout;
+
+		err = -ENODEV;
+		if (!netif_device_present(dev))
+			goto errout;
+
+		/* Temporary sockaddr sized for exactly addr_len bytes. */
+		addr_size = sizeof(sa_family_t) + dev->addr_len;
+		err = -ENOMEM;
+		addr = kmalloc(addr_size, GFP_KERNEL);
+		if (!addr)
+			goto errout;
+
+		addr->sa_family = dev->type;
+		memcpy(addr->sa_data, nla_data(tb[IFLA_ADDRESS]),
+		       dev->addr_len);
+		err = dev->set_mac_address(dev, addr);
+		kfree(addr);
+		if (err)
+			goto errout;
+
+		notify_addr = 1;
+		modified = 1;
+	}
+
+	if (tb[IFLA_MTU]) {
+		err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	/*
+	 * Interface selected by interface index but interface
+	 * name provided implies that a name change has been
+	 * requested.
+	 */
+	if (ifm->ifi_index > 0 && ifname[0]) {
+		err = dev_change_name(dev, ifname);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_IFALIAS]) {
+		err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+				    nla_len(tb[IFLA_IFALIAS]));
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_BROADCAST]) {
+		nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
+		notify_addr = 1;
+	}
+
+	if (ifm->ifi_flags || ifm->ifi_change) {
+		unsigned int new_flags = ifm->ifi_flags;
+
+		/* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+		if (ifm->ifi_change)
+			new_flags = (new_flags & ifm->ifi_change) |
+				    (dev->flags & ~ifm->ifi_change);
+		err = dev_change_flags(dev, new_flags);
+		if (err < 0)
+			goto errout;
+	}
+
+	if (tb[IFLA_TXQLEN])
+		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+
+	if (tb[IFLA_OPERSTATE])
+		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+
+	if (tb[IFLA_LINKMODE]) {
+		write_lock_bh(&dev_base_lock);
+		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+		write_unlock_bh(&dev_base_lock);
+	}
+
+	err = 0;
+
+errout:
+	if (err < 0 && modified && net_ratelimit())
+		printk(KERN_WARNING "A link change request failed with "
+		       "some changes comitted already. Interface %s may "
+		       "have been left with an inconsistent configuration, "
+		       "please check.\n", dev->name);
+
+	if (notify_addr)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+
+/*
+ * rtnl_setlink - doit handler that modifies an existing link.
+ *
+ * The device is selected by ifi_index when that is positive, otherwise by
+ * IFLA_IFNAME. A reference is held on the device while validate_linkmsg()
+ * and do_setlink() run.
+ *
+ * Returns 0 on success; -EINVAL when neither an index nor a name is given,
+ * -ENODEV when the device does not exist, or the error reported by
+ * parsing, validation or do_setlink().
+ */
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFLA_MAX+1];
+	char ifname[IFNAMSIZ];
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		ifname[0] = '\0';
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0)
+		dev = dev_get_by_index(net, ifm->ifi_index);
+	else if (tb[IFLA_IFNAME])
+		dev = dev_get_by_name(net, ifname);
+	else
+		return -EINVAL;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	err = validate_linkmsg(dev, tb);
+	if (err >= 0)
+		err = do_setlink(dev, ifm, tb, ifname, 0);
+
+	/* drop the reference taken by the lookup above */
+	dev_put(dev);
+	return err;
+}
+
+/*
+ * rtnl_dellink - doit handler that deletes a link through its rtnl_link_ops.
+ *
+ * The device is selected by ifi_index when that is positive, otherwise by
+ * IFLA_IFNAME, and is then handed to the dellink callback of its ops.
+ *
+ * Returns 0 on success; -EINVAL when neither an index nor a name is given,
+ * -ENODEV when the device does not exist, -EOPNOTSUPP when the device has
+ * no rtnl_link_ops, or the error from attribute parsing.
+ */
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFLA_MAX+1];
+	char ifname[IFNAMSIZ];
+	const struct rtnl_link_ops *ops;
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0) {
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	} else if (tb[IFLA_IFNAME]) {
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, ifname);
+	} else {
+		return -EINVAL;
+	}
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	ops = dev->rtnl_link_ops;
+	if (ops == NULL)
+		return -EOPNOTSUPP;
+
+	ops->dellink(dev);
+	return 0;
+}
+
+/*
+ * rtnl_create_link - allocate a net_device for @ops and preset it from @tb.
+ * @net: namespace assigned to the new device
+ * @ifname: device name; if the name contains '%', dev_alloc_name() is asked
+ *          to pick the final name
+ * @ops: link operations supplying priv_size and the setup callback
+ * @tb: parsed IFLA_* attributes; MTU, addresses, queue length, operstate
+ *      and link mode are copied from it when present
+ *
+ * The device is only allocated and initialised here; registering it is left
+ * to the caller.
+ *
+ * Returns the new device, ERR_PTR(-ENOMEM) if allocation fails, or the
+ * ERR_PTR-encoded error of dev_alloc_name().
+ */
+struct net_device *rtnl_create_link(struct net *net, char *ifname,
+	const struct rtnl_link_ops *ops, struct nlattr *tb[])
+{
+	struct net_device *dev;
+	int err;
+
+	dev = alloc_netdev(ops->priv_size, ifname, ops->setup);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	if (strchr(dev->name, '%')) {
+		err = dev_alloc_name(dev, dev->name);
+		if (err < 0) {
+			free_netdev(dev);
+			return ERR_PTR(err);
+		}
+	}
+
+	dev_net_set(dev, net);
+	dev->rtnl_link_ops = ops;
+
+	if (tb[IFLA_MTU])
+		dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+
+	if (tb[IFLA_ADDRESS])
+		memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
+		       nla_len(tb[IFLA_ADDRESS]));
+
+	if (tb[IFLA_BROADCAST])
+		memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
+		       nla_len(tb[IFLA_BROADCAST]));
+
+	if (tb[IFLA_TXQLEN])
+		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+
+	if (tb[IFLA_OPERSTATE])
+		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+
+	if (tb[IFLA_LINKMODE])
+		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+
+	return dev;
+}
+
+/*
+ * rtnl_newlink - doit handler that creates a link or changes an existing one.
+ *
+ * After parsing the attributes, the target device is looked up by index or
+ * name and the link ops are looked up by IFLA_INFO_KIND.
+ *
+ * If the device exists, the request is a change: NLM_F_EXCL yields -EEXIST,
+ * NLM_F_REPLACE yields -EOPNOTSUPP, kind-specific data goes through
+ * ops->changelink() and the generic attributes through do_setlink().
+ *
+ * If it does not exist, NLM_F_CREATE is required. When the kind is unknown
+ * and CONFIG_MODULES is set, the module "rtnl-link-<kind>" is requested
+ * with the rtnl lock dropped, and the whole request is replayed if the ops
+ * have appeared. The device is then built with rtnl_create_link() and
+ * handed to ops->newlink() or, without that callback, to
+ * register_netdevice().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	const struct rtnl_link_ops *ops;
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	char kind[MODULE_NAME_LEN];
+	char ifname[IFNAMSIZ];
+	struct nlattr *tb[IFLA_MAX+1];
+	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+	int err;
+
+#ifdef CONFIG_MODULES
+replay:
+#endif
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		ifname[0] = '\0';
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0)
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	else if (ifname[0])
+		dev = __dev_get_by_name(net, ifname);
+	else
+		dev = NULL;
+
+	err = validate_linkmsg(dev, tb);
+	if (err < 0)
+		return err;
+
+	if (!tb[IFLA_LINKINFO]) {
+		memset(linkinfo, 0, sizeof(linkinfo));
+	} else {
+		err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
+				       tb[IFLA_LINKINFO], ifla_info_policy);
+		if (err < 0)
+			return err;
+	}
+
+	if (!linkinfo[IFLA_INFO_KIND]) {
+		kind[0] = '\0';
+		ops = NULL;
+	} else {
+		nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+		ops = rtnl_link_ops_get(kind);
+	}
+
+	/* Own scope: attr[] is sized from ops, which is only known here. */
+	{
+		struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL;
+
+		if (ops) {
+			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+				err = nla_parse_nested(attr, ops->maxtype,
+						       linkinfo[IFLA_INFO_DATA],
+						       ops->policy);
+				if (err < 0)
+					return err;
+				data = attr;
+			}
+			if (ops->validate) {
+				err = ops->validate(tb, data);
+				if (err < 0)
+					return err;
+			}
+		}
+
+		/* Existing device: this is a change request. */
+		if (dev) {
+			int modified = 0;
+
+			if (nlh->nlmsg_flags & NLM_F_EXCL)
+				return -EEXIST;
+			if (nlh->nlmsg_flags & NLM_F_REPLACE)
+				return -EOPNOTSUPP;
+
+			if (linkinfo[IFLA_INFO_DATA]) {
+				if (!ops || ops != dev->rtnl_link_ops ||
+				    !ops->changelink)
+					return -EOPNOTSUPP;
+
+				err = ops->changelink(dev, tb, data);
+				if (err < 0)
+					return err;
+				modified = 1;
+			}
+
+			return do_setlink(dev, ifm, tb, ifname, modified);
+		}
+
+		/* No such device: creation must be asked for explicitly. */
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+			return -ENODEV;
+
+		if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change)
+			return -EOPNOTSUPP;
+		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+			return -EOPNOTSUPP;
+
+		if (!ops) {
+#ifdef CONFIG_MODULES
+			if (kind[0]) {
+				/* the lock is dropped around the module load */
+				__rtnl_unlock();
+				request_module("rtnl-link-%s", kind);
+				rtnl_lock();
+				ops = rtnl_link_ops_get(kind);
+				if (ops)
+					goto replay;
+			}
+#endif
+			return -EOPNOTSUPP;
+		}
+
+		/* No name given: use "<kind>%d" as the name template. */
+		if (!ifname[0])
+			snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+
+		dev = rtnl_create_link(net, ifname, ops, tb);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
+
+		err = ops->newlink ? ops->newlink(dev, tb, data)
+				   : register_netdevice(dev);
+		if (err < 0)
+			free_netdev(dev);
+		return err;
+	}
+}
+
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{ /* RTM_GETLINK handler: look up one device by ifi_index and unicast its link info to the requester. */
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct sk_buff *nskb;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0) {
+ dev = dev_get_by_index(net, ifm->ifi_index); /* paired with dev_put() at errout */
+ if (dev == NULL)
+ return -ENODEV;
+ } else
+ return -EINVAL; /* only lookup by a positive index is handled here */
+
+ nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+ if (nskb == NULL) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
+ nlh->nlmsg_seq, 0, 0); /* the reply is formatted as an RTM_NEWLINK message */
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ goto errout;
+ }
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
+errout:
+ dev_put(dev); /* runs on the success path too */
+
+ return err;
+}
+
+static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+{ /* Dump callback that runs every protocol family's dumpit handler for the request's message type. */
+ int idx;
+ int s_idx = cb->family; /* family index to resume from */
+
+ if (s_idx == 0)
+ s_idx = 1;
+ for (idx=1; idx<NPROTO; idx++) {
+ int type = cb->nlh->nlmsg_type-RTM_BASE;
+ if (idx < s_idx || idx == PF_PACKET)
+ continue;
+ if (rtnl_msg_handlers[idx] == NULL ||
+ rtnl_msg_handlers[idx][type].dumpit == NULL)
+ continue;
+ if (idx > s_idx)
+ memset(&cb->args[0], 0, sizeof(cb->args)); /* new family: clear the per-family cursor */
+ if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
+ break; /* non-zero return stops the walk; presumably the skb is full - confirm */
+ }
+ cb->family = idx; /* saved so the next call starts at this family */
+
+ return skb->len;
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
+{ /* Build a link message of the given type for dev and send it to the RTNLGRP_LINK group. */
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS; /* reported if the allocation below fails */
+
+ skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err); /* the function returns void, so the error is handed to rtnl_set_sk_err() */
+}
+
+/* Protected by RTNL semaphore. */
+static struct rtattr **rta_buf; /* rtattr_max pointers, allocated in rtnetlink_init() */
+static int rtattr_max; /* largest entry of rta_max[], computed in rtnetlink_init() */
+
+/* Process one rtnetlink message. */
+
+static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{ /* Validate one rtnetlink message and dispatch it to the registered dump or doit handler. */
+ struct net *net = sock_net(skb->sk);
+ rtnl_doit_func doit;
+ int sz_idx, kind;
+ int min_len;
+ int family;
+ int type;
+ int err;
+
+ type = nlh->nlmsg_type;
+ if (type > RTM_MAX)
+ return -EOPNOTSUPP;
+
+ type -= RTM_BASE; /* from here on, type is an index relative to RTM_BASE */
+
+ /* All the messages must have at least 1 byte length */
+ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
+ return 0; /* too short to carry a family byte: dropped without an error code */
+
+ family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
+ if (family >= NPROTO)
+ return -EAFNOSUPPORT;
+
+ sz_idx = type>>2; /* index into rtm_min[] and rta_max[]: one entry per group of four types */
+ kind = type&3; /* position within the group; 2 is the only kind allowed without CAP_NET_ADMIN */
+
+ if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ struct sock *rtnl;
+ rtnl_dumpit_func dumpit;
+
+ dumpit = rtnl_get_dumpit(family, type);
+ if (dumpit == NULL)
+ return -EOPNOTSUPP;
+
+ __rtnl_unlock(); /* the dump is started with the RTNL lock dropped ... */
+ rtnl = net->rtnl;
+ err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
+ rtnl_lock(); /* ... and the lock is retaken before returning to rtnetlink_rcv() */
+ return err;
+ }
+
+ memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
+
+ min_len = rtm_min[sz_idx];
+ if (nlh->nlmsg_len < min_len)
+ return -EINVAL;
+
+ if (nlh->nlmsg_len > min_len) {
+ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+ struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len);
+
+ while (RTA_OK(attr, attrlen)) {
+ unsigned flavor = attr->rta_type;
+ if (flavor) {
+ if (flavor > rta_max[sz_idx])
+ return -EINVAL;
+ rta_buf[flavor-1] = attr; /* slot N-1 holds attribute type N; a repeated type overwrites the earlier pointer */
+ }
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ }
+
+ doit = rtnl_get_doit(family, type);
+ if (doit == NULL)
+ return -EOPNOTSUPP;
+
+ return doit(skb, nlh, (void *)&rta_buf[0]);
+}
+
+static void rtnetlink_rcv(struct sk_buff *skb)
+{ /* Socket input callback: process every message in skb with the RTNL lock held. */
+ rtnl_lock();
+ netlink_rcv_skb(skb, &rtnetlink_rcv_msg); /* rtnetlink_rcv_msg() is invoked per message */
+ rtnl_unlock();
+}
+
+static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
+{ /* Netdevice notifier: translate device events into link notifications via rtmsg_ifinfo(). */
+ struct net_device *dev = ptr;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
+ break;
+ case NETDEV_REGISTER:
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+ break;
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+ break;
+ case NETDEV_CHANGE:
+ case NETDEV_GOING_DOWN:
+ break; /* no message is generated here for these two events */
+ default:
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0); /* every other event: RTM_NEWLINK with an empty change mask */
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rtnetlink_dev_notifier = { /* registered in rtnetlink_init() */
+ .notifier_call = rtnetlink_event,
+};
+
+
+static int rtnetlink_net_init(struct net *net)
+{
+ /* Per-namespace NETLINK_ROUTE kernel socket; input goes to rtnetlink_rcv. */
+ struct sock *nlsk = netlink_kernel_create(net, NETLINK_ROUTE,
+ RTNLGRP_MAX, rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
+ if (nlsk == NULL)
+ return -ENOMEM;
+ net->rtnl = nlsk;
+ return 0;
+}
+
+static void rtnetlink_net_exit(struct net *net)
+{ /* Per-namespace teardown: release the socket stored by rtnetlink_net_init(). */
+ netlink_kernel_release(net->rtnl);
+ net->rtnl = NULL; /* do not leave a stale pointer behind */
+}
+
+static struct pernet_operations rtnetlink_net_ops = { /* registered in rtnetlink_init() */
+ .init = rtnetlink_net_init,
+ .exit = rtnetlink_net_exit,
+};
+
+void __init rtnetlink_init(void)
+{ /* Boot-time setup: size and allocate rta_buf, register the pernet ops, the notifier and the PF_UNSPEC handlers. */
+ int i;
+
+ rtattr_max = 0;
+ for (i = 0; i < ARRAY_SIZE(rta_max); i++)
+ if (rta_max[i] > rtattr_max)
+ rtattr_max = rta_max[i]; /* rta_buf must fit the largest attribute table */
+ rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL);
+ if (!rta_buf)
+ panic("rtnetlink_init: cannot allocate rta_buf\n");
+
+ if (register_pernet_subsys(&rtnetlink_net_ops))
+ panic("rtnetlink_init: cannot initialize rtnetlink\n");
+
+ netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
+ register_netdevice_notifier(&rtnetlink_dev_notifier);
+
+ rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo); /* arguments: family, type, doit, dumpit */
+ rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all); /* dump-only: fans out to every family */
+ rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all);
+}
+
+EXPORT_SYMBOL(__rta_fill); /* symbols from this file that are exported for use outside it */
+EXPORT_SYMBOL(rtnetlink_put_metrics);
+EXPORT_SYMBOL(rtnl_lock);
+EXPORT_SYMBOL(rtnl_trylock);
+EXPORT_SYMBOL(rtnl_unlock);
+EXPORT_SYMBOL(rtnl_is_locked);
+EXPORT_SYMBOL(rtnl_unicast);
+EXPORT_SYMBOL(rtnl_notify);
+EXPORT_SYMBOL(rtnl_set_sk_err);
+EXPORT_SYMBOL(rtnl_create_link);
+EXPORT_SYMBOL(ifla_policy);
diff --git a/net/core/scm.c b/net/core/scm.c
new file mode 100644
index 0000000..b12303d
--- /dev/null
+++ b/net/core/scm.c
@@ -0,0 +1,314 @@
+/* scm.c - Socket level control messages processing.
+ *
+ * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Alignment and value checking mods by Craig Metz
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/compat.h>
+#include <net/scm.h>
+
+
+/*
+ * Only allow a user to send credentials, that they could set with
+ * setu(g)id.
+ */
+
+static __inline__ int scm_check_creds(struct ucred *creds)
+{
+ /* Each id must match one of the caller's own or be covered by a capability. */
+ if (creds->pid != task_tgid_vnr(current) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (creds->uid != current->uid && creds->uid != current->euid &&
+ creds->uid != current->suid && !capable(CAP_SETUID))
+ return -EPERM;
+ return (creds->gid != current->gid && creds->gid != current->egid &&
+ creds->gid != current->sgid && !capable(CAP_SETGID)) ? -EPERM : 0;
+}
+
+static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
+{ /* Append the files named in cmsg's payload to *fplp (allocated on first use); returns the fd count or a negative errno. */
+ int *fdp = (int*)CMSG_DATA(cmsg);
+ struct scm_fp_list *fpl = *fplp;
+ struct file **fpp;
+ int i, num;
+
+ num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); /* ints carried in the payload */
+
+ if (num <= 0)
+ return 0;
+
+ if (num > SCM_MAX_FD)
+ return -EINVAL;
+
+ if (!fpl)
+ {
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+ *fplp = fpl; /* published right away, so the list stays reachable through *fplp on later errors */
+ fpl->count = 0;
+ }
+ fpp = &fpl->fp[fpl->count]; /* first free slot */
+
+ if (fpl->count + num > SCM_MAX_FD)
+ return -EINVAL;
+
+ /*
+ * Verify the descriptors and increment the usage count.
+ */
+
+ for (i=0; i< num; i++)
+ {
+ int fd = fdp[i];
+ struct file *file;
+
+ if (fd < 0 || !(file = fget(fd)))
+ return -EBADF; /* files stored so far stay counted in fpl; nothing is released here */
+ *fpp++ = file;
+ fpl->count++;
+ }
+ return num;
+}
+
+void __scm_destroy(struct scm_cookie *scm)
+{ /* Detach scm->fp and drop every file reference it holds, using a per-task work list instead of nested processing. */
+ struct scm_fp_list *fpl = scm->fp;
+ int i;
+
+ if (fpl) {
+ scm->fp = NULL;
+ if (current->scm_work_list) {
+ list_add_tail(&fpl->list, current->scm_work_list); /* a drain loop is already active for this task: just queue */
+ } else {
+ LIST_HEAD(work_list);
+
+ current->scm_work_list = &work_list; /* calls made while draining will queue onto this list */
+
+ list_add(&fpl->list, &work_list);
+ while (!list_empty(&work_list)) {
+ fpl = list_first_entry(&work_list, struct scm_fp_list, list);
+
+ list_del(&fpl->list);
+ for (i=fpl->count-1; i>=0; i--)
+ fput(fpl->fp[i]); /* presumably this can re-enter __scm_destroy() - confirm */
+ kfree(fpl);
+ }
+
+ current->scm_work_list = NULL;
+ }
+ }
+}
+
+int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
+{ /* Parse the SOL_SOCKET control messages of msg into the cookie p; returns 0 or a negative errno. */
+ struct cmsghdr *cmsg;
+ int err;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
+ {
+ err = -EINVAL; /* default for every "goto error" below that does not set err itself */
+
+ /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
+ /* The first check was omitted in <= 2.2.5. The reasoning was
+ that parser checks cmsg_len in any case, so that
+ additional check would be work duplication.
+ But if cmsg_level is not SOL_SOCKET, we do not check
+ for too short ancillary data object at all! Oops.
+ OK, let's add it...
+ */
+ if (!CMSG_OK(msg, cmsg))
+ goto error;
+
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue; /* other levels are skipped, not rejected */
+
+ switch (cmsg->cmsg_type)
+ {
+ case SCM_RIGHTS:
+ err=scm_fp_copy(cmsg, &p->fp);
+ if (err<0)
+ goto error;
+ break;
+ case SCM_CREDENTIALS:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
+ goto error;
+ memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&p->creds);
+ if (err)
+ goto error;
+ break;
+ default:
+ goto error; /* unknown SOL_SOCKET message type: -EINVAL */
+ }
+ }
+
+ if (p->fp && !p->fp->count)
+ {
+ kfree(p->fp); /* a list was allocated but holds no files */
+ p->fp = NULL;
+ }
+ return 0;
+
+error:
+ scm_destroy(p);
+ return err;
+}
+
+int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
+{ /* Copy one control message (header plus len bytes of data) into the user control buffer described by msg. */
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user *)msg->msg_control;
+ struct cmsghdr cmhdr;
+ int cmlen = CMSG_LEN(len);
+ int err;
+
+ if (MSG_CMSG_COMPAT & msg->msg_flags)
+ return put_cmsg_compat(msg, level, type, len, data);
+
+ if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return 0; /* XXX: return error? check spec. */
+ }
+ if (msg->msg_controllen < cmlen) {
+ msg->msg_flags |= MSG_CTRUNC;
+ cmlen = msg->msg_controllen; /* truncate the message to the space that is left */
+ }
+ cmhdr.cmsg_level = level;
+ cmhdr.cmsg_type = type;
+ cmhdr.cmsg_len = cmlen;
+
+ err = -EFAULT;
+ if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
+ goto out;
+ if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
+ goto out;
+ cmlen = CMSG_SPACE(len); /* advance by the padded size ... */
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen; /* ... but never past the end of the buffer */
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ err = 0;
+out:
+ return err;
+}
+
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{ /* Install the files held in scm->fp as new descriptors and report them to user space in an SCM_RIGHTS message. */
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user*)msg->msg_control;
+
+ int fdmax = 0;
+ int fdnum = scm->fp->count;
+ struct file **fp = scm->fp->fp;
+ int __user *cmfptr;
+ int err = 0, i;
+
+ if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ scm_detach_fds_compat(msg, scm);
+ return;
+ }
+
+ if (msg->msg_controllen > sizeof(struct cmsghdr))
+ fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
+ / sizeof(int)); /* how many ints fit after the header */
+
+ if (fdnum < fdmax)
+ fdmax = fdnum;
+
+ for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
+ i++, cmfptr++)
+ {
+ int new_fd;
+ err = security_file_receive(fp[i]);
+ if (err)
+ break;
+ err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
+ ? O_CLOEXEC : 0);
+ if (err < 0)
+ break;
+ new_fd = err;
+ err = put_user(new_fd, cmfptr);
+ if (err) {
+ put_unused_fd(new_fd); /* the number could not be reported, so give the slot back */
+ break;
+ }
+ /* Bump the usage count and install the file. */
+ get_file(fp[i]);
+ fd_install(new_fd, fp[i]);
+ }
+
+ if (i > 0)
+ {
+ int cmlen = CMSG_LEN(i*sizeof(int)); /* header describes only the i descriptors actually installed */
+ err = put_user(SOL_SOCKET, &cm->cmsg_level);
+ if (!err)
+ err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+ if (!err)
+ err = put_user(cmlen, &cm->cmsg_len);
+ if (!err) {
+ cmlen = CMSG_SPACE(i*sizeof(int));
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ }
+ }
+ if (i < fdnum || (fdnum && fdmax <= 0))
+ msg->msg_flags |= MSG_CTRUNC; /* not every file was delivered */
+
+ /*
+ * All of the files that fit in the message have had their
+ * usage counts incremented, so we just free the list.
+ */
+ __scm_destroy(scm);
+}
+
+struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
+{ /* Return a newly allocated copy of fpl holding its own reference on each file; NULL if fpl is NULL or allocation fails. */
+ struct scm_fp_list *new_fpl;
+ int i;
+
+ if (!fpl)
+ return NULL;
+
+ new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
+ if (new_fpl) {
+ for (i=fpl->count-1; i>=0; i--)
+ get_file(fpl->fp[i]); /* one extra reference per file, owned by the copy */
+ memcpy(new_fpl, fpl, sizeof(*fpl)); /* whole structure is copied, list linkage included */
+ }
+ return new_fpl;
+}
+
+EXPORT_SYMBOL(__scm_destroy); /* the non-static functions of this file are exported */
+EXPORT_SYMBOL(__scm_send);
+EXPORT_SYMBOL(put_cmsg);
+EXPORT_SYMBOL(scm_detach_fds);
+EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c
new file mode 100644
index 0000000..8623492
--- /dev/null
+++ b/net/core/skb_dma_map.c
@@ -0,0 +1,66 @@
+/* skb_dma_map.c: DMA mapping helpers for socket buffers.
+ *
+ * Copyright (C) David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+#include <linux/skbuff.h>
+
+int skb_dma_map(struct device *dev, struct sk_buff *skb,
+ enum dma_data_direction dir)
+{ /* DMA-map the linear head and every page fragment of skb; returns 0, or -ENOMEM with all mappings undone. */
+ struct skb_shared_info *sp = skb_shinfo(skb);
+ dma_addr_t map;
+ int i;
+
+ map = dma_map_single(dev, skb->data,
+ skb_headlen(skb), dir);
+ if (dma_mapping_error(dev, map))
+ goto out_err;
+
+ sp->dma_maps[0] = map; /* slot 0: linear head */
+ for (i = 0; i < sp->nr_frags; i++) {
+ skb_frag_t *fp = &sp->frags[i];
+
+ map = dma_map_page(dev, fp->page, fp->page_offset,
+ fp->size, dir);
+ if (dma_mapping_error(dev, map))
+ goto unwind;
+ sp->dma_maps[i + 1] = map; /* slot i + 1: fragment i */
+ }
+ sp->num_dma_maps = i + 1;
+
+ return 0;
+
+unwind:
+ while (--i >= 0) { /* i is the fragment that failed; undo fragments i-1 down to 0 */
+ skb_frag_t *fp = &sp->frags[i];
+
+ dma_unmap_page(dev, sp->dma_maps[i + 1],
+ fp->size, dir);
+ }
+ dma_unmap_single(dev, sp->dma_maps[0],
+ skb_headlen(skb), dir);
+out_err:
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(skb_dma_map);
+
+void skb_dma_unmap(struct device *dev, struct sk_buff *skb,
+ enum dma_data_direction dir)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int frag;
+
+ /* Slot 0 of dma_maps[] holds the mapping of the linear head. */
+ dma_unmap_single(dev, shinfo->dma_maps[0],
+ skb_headlen(skb), dir);
+
+ /* Slots 1..nr_frags hold the page fragment mappings, in order. */
+ for (frag = 0; frag < shinfo->nr_frags; frag++)
+ dma_unmap_page(dev, shinfo->dma_maps[frag + 1],
+ shinfo->frags[frag].size, dir);
+}
+EXPORT_SYMBOL(skb_dma_unmap);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
new file mode 100644
index 0000000..aa89c30
--- /dev/null
+++ b/net/core/skbuff.c
@@ -0,0 +1,2665 @@
+/*
+ * Routines having to do with the 'struct sk_buff' memory handlers.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
+ * Florian La Roche <rzsfl@rz.uni-sb.de>
+ *
+ * Fixes:
+ * Alan Cox : Fixed the worst of the load
+ * balancer bugs.
+ * Dave Platt : Interrupt stacking fix.
+ * Richard Kooijman : Timestamp fixes.
+ * Alan Cox : Changed buffer format.
+ * Alan Cox : destructor hook for AF_UNIX etc.
+ * Linus Torvalds : Better skb_clone.
+ * Alan Cox : Added skb_copy.
+ * Alan Cox : Added all the changed routines Linus
+ * only put in the headers
+ * Ray VanTassle : Fixed --skb->lock in free
+ * Alan Cox : skb_copy copy arp field
+ * Andi Kleen : slabified it.
+ * Robert Olsson : Removed skb_head_pool
+ *
+ * NOTE:
+ * The __skb_ routines should be called with interrupts
+ * disabled, or you better be *real* sure that the operation is atomic
+ * with respect to whatever list is being frobbed (e.g. via lock_sock()
+ * or via disabling bottom half handlers, etc).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * The functions in this file will not compile correctly with gcc 2.4.x
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_NET_CLS_ACT
+#include <net/pkt_sched.h>
+#endif
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/cache.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+
+#include <net/protocol.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include "kmap_skb.h"
+
+static struct kmem_cache *skbuff_head_cache __read_mostly; /* cache for plain sk_buff heads */
+static struct kmem_cache *skbuff_fclone_cache __read_mostly; /* cache used by __alloc_skb() when fclone is set */
+
+static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{ /* .release hook of sock_pipe_buf_ops: drop the page reference held by buf. */
+ put_page(buf->page);
+}
+
+static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{ /* .get hook of sock_pipe_buf_ops: take an additional reference on buf's page. */
+ get_page(buf->page);
+}
+
+static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{ /* .steal hook of sock_pipe_buf_ops: unconditionally returns 1. */
+ return 1; /* presumably a non-zero result refuses the steal - confirm against the pipe_buf_operations contract */
+}
+
+
+/* Pipe buffer operations for a socket: generic map/unmap/confirm, local release/steal/get. */
+static struct pipe_buf_operations sock_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .confirm = generic_pipe_buf_confirm,
+ .release = sock_pipe_buf_release,
+ .steal = sock_pipe_buf_steal,
+ .get = sock_pipe_buf_get,
+};
+
+/*
+ * Keep out-of-line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always
+ * reliable.
+ */
+
+/**
+ * skb_over_panic - private function
+ * @skb: buffer
+ * @sz: size
+ * @here: address
+ *
+ * Out of line support code for skb_put(). Not user callable.
+ */
+void skb_over_panic(struct sk_buff *skb, int sz, void *here)
+{ /* Log the buffer's pointers and lengths at KERN_EMERG, then BUG(). */
+ printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
+ "data:%p tail:%#lx end:%#lx dev:%s\n",
+ here, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>"); /* skb->dev may be unset */
+ BUG();
+}
+
+/**
+ * skb_under_panic - private function
+ * @skb: buffer
+ * @sz: size
+ * @here: address
+ *
+ * Out of line support code for skb_push(). Not user callable.
+ */
+
+void skb_under_panic(struct sk_buff *skb, int sz, void *here)
+{ /* Same report as skb_over_panic() under a different message prefix, then BUG(). */
+ printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
+ "data:%p tail:%#lx end:%#lx dev:%s\n",
+ here, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>"); /* skb->dev may be unset */
+ BUG();
+}
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
+
+/**
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @fclone: allocate from fclone cache instead of head cache
+ * and allocate a cloned (child) skb
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of size bytes. The object has a reference count of one.
+ * The return is the buffer. On a failure the return is %NULL.
+ *
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int fclone, int node)
+{
+ struct kmem_cache *cache;
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ u8 *data;
+
+ cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); /* __GFP_DMA is masked out for the head only */
+ if (!skb)
+ goto out;
+
+ size = SKB_DATA_ALIGN(size);
+ data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
+ gfp_mask, node); /* one allocation: data area followed by the shared info */
+ if (!data)
+ goto nodata;
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = size + sizeof(struct sk_buff);
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ atomic_set(&shinfo->dataref, 1);
+ shinfo->nr_frags = 0;
+ shinfo->gso_size = 0;
+ shinfo->gso_segs = 0;
+ shinfo->gso_type = 0;
+ shinfo->ip6_frag_id = 0;
+ shinfo->frag_list = NULL;
+
+ if (fclone) {
+ struct sk_buff *child = skb + 1; /* fclone object layout: skb, child skb, then the shared atomic_t */
+ atomic_t *fclone_ref = (atomic_t *) (child + 1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ atomic_set(fclone_ref, 1);
+
+ child->fclone = SKB_FCLONE_UNAVAILABLE; /* child slot not yet handed out as a clone */
+ }
+out:
+ return skb;
+nodata:
+ kmem_cache_free(cache, skb); /* data allocation failed: give the head back */
+ skb = NULL;
+ goto out;
+}
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has unspecified headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+ unsigned int length, gfp_t gfp_mask)
+{
+ int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; /* -1 when the device has no parent */
+ struct sk_buff *skb;
+
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD); /* the extra NET_SKB_PAD bytes become headroom */
+ skb->dev = dev;
+ }
+ return skb;
+}
+
+struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
+{
+ int nid = -1; /* used when the device has no parent */
+
+ if (dev->dev.parent)
+ nid = dev_to_node(dev->dev.parent);
+ return alloc_pages_node(nid, gfp_mask, 0); /* a single order-0 page */
+}
+EXPORT_SYMBOL(__netdev_alloc_page);
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+ int size)
+{ /* Attach page as fragment i of skb and add size to the skb's length accounting. */
+ skb_fill_page_desc(skb, i, page, off, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += size; /* only size is added here, not the size of the whole page */
+}
+EXPORT_SYMBOL(skb_add_rx_frag);
+
+/**
+ * dev_alloc_skb - allocate an skbuff for receiving
+ * @length: length to allocate
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has unspecified headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory. Although this function
+ * allocates memory it can be called from an interrupt.
+ */
+struct sk_buff *dev_alloc_skb(unsigned int length)
+{
+ /*
+ * There is more code here than it seems:
+ * __dev_alloc_skb is an inline
+ */
+ return __dev_alloc_skb(length, GFP_ATOMIC); /* always GFP_ATOMIC, whatever the calling context */
+}
+EXPORT_SYMBOL(dev_alloc_skb);
+
+static void skb_drop_list(struct sk_buff **listp)
+{ /* Detach the chain at *listp and kfree_skb() each buffer on it; *listp must not be NULL on entry. */
+ struct sk_buff *list = *listp;
+
+ *listp = NULL;
+
+ do {
+ struct sk_buff *this = list;
+ list = list->next; /* read the link before the buffer may be freed */
+ kfree_skb(this);
+ } while (list);
+}
+
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{ /* Drop skb's frag_list chain; skb_release_data() checks that the list is non-NULL before calling. */
+ skb_drop_list(&skb_shinfo(skb)->frag_list);
+}
+
+static void skb_clone_fraglist(struct sk_buff *skb)
+{
+ struct sk_buff *frag = skb_shinfo(skb)->frag_list;
+ /* Take one extra reference on every buffer chained on frag_list. */
+ for (; frag != NULL; frag = frag->next)
+ skb_get(frag);
+}
+
+static void skb_release_data(struct sk_buff *skb)
+{ /* Drop this skb's hold on its data area and free the area (pages, frag list, head) when nobody else holds it. */
+ if (!skb->cloned ||
+ !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &skb_shinfo(skb)->dataref)) { /* a nohdr skb subtracts an additional 1 << SKB_DATAREF_SHIFT */
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ put_page(skb_shinfo(skb)->frags[i].page);
+ }
+
+ if (skb_shinfo(skb)->frag_list)
+ skb_drop_fraglist(skb);
+
+ kfree(skb->head); /* frees the data area together with the shared info */
+ }
+}
+
+/*
+ * Free an skbuff by memory without cleaning the state.
+ */
+static void kfree_skbmem(struct sk_buff *skb)
+{ /* Return the sk_buff structure itself to the cache it came from, chosen by skb->fclone. */
+ struct sk_buff *other;
+ atomic_t *fclone_ref;
+
+ switch (skb->fclone) {
+ case SKB_FCLONE_UNAVAILABLE:
+ kmem_cache_free(skbuff_head_cache, skb);
+ break;
+
+ case SKB_FCLONE_ORIG:
+ fclone_ref = (atomic_t *) (skb + 2); /* counter sits after the companion skb, as laid out in __alloc_skb() */
+ if (atomic_dec_and_test(fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, skb);
+ break;
+
+ case SKB_FCLONE_CLONE:
+ fclone_ref = (atomic_t *) (skb + 1); /* skb is the second half of the pair here */
+ other = skb - 1; /* the original, which is the start of the cache object */
+
+ /* The clone portion is available for
+ * fast-cloning again.
+ */
+ skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+ if (atomic_dec_and_test(fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, other);
+ break;
+ }
+}
+
+static void skb_release_head_state(struct sk_buff *skb)
+{ /* Release what the sk_buff head references (dst, secpath, destructor, netfilter state); the data area is untouched. */
+ dst_release(skb->dst);
+#ifdef CONFIG_XFRM
+ secpath_put(skb->sp);
+#endif
+ if (skb->destructor) {
+ WARN_ON(in_irq()); /* warn if a destructor is about to run in hard-irq context */
+ skb->destructor(skb);
+ }
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ nf_conntrack_put(skb->nfct);
+ nf_conntrack_put_reasm(skb->nfct_reasm);
+#endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+ nf_bridge_put(skb->nf_bridge);
+#endif
+/* XXX: IS this still necessary? - JHS */
+#ifdef CONFIG_NET_SCHED
+ skb->tc_index = 0;
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = 0;
+#endif
+#endif
+}
+
+/* Free everything but the sk_buff shell. */
+static void skb_release_all(struct sk_buff *skb)
+{ /* Release head state first, then the data area; the sk_buff structure itself is not freed here. */
+ skb_release_head_state(skb);
+ skb_release_data(skb);
+}
+
+/**
+ * __kfree_skb - private function
+ * @skb: buffer
+ *
+ * Free an sk_buff. Release anything attached to the buffer.
+ * Clean the state. This is an internal helper function. Users should
+ * always call kfree_skb
+ */
+
+void __kfree_skb(struct sk_buff *skb)
+{ /* Unconditional free: skb->users is not examined here (kfree_skb() does that). */
+ skb_release_all(skb);
+ kfree_skbmem(skb);
+}
+
+/**
+ * kfree_skb - free an sk_buff
+ * @skb: buffer to free
+ *
+ * Drop a reference to the buffer and free it if the usage count has
+ * hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return; /* NULL is accepted and ignored */
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb(); /* sole-owner fast path: skips the atomic decrement; NOTE(review): barrier pairing not visible here */
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return; /* other references remain */
+ __kfree_skb(skb);
+}
+
+/**
+ * skb_recycle_check - check if skb can be reused for receive
+ * @skb: buffer
+ * @skb_size: minimum receive buffer size
+ *
+ * Checks that the skb passed in is not shared or cloned, and
+ * that it is linear and its head portion at least as large as
+ * skb_size so that it can be recycled as a receive buffer.
+ * If these conditions are met, this function does any necessary
+ * reference count dropping and cleans up the skbuff as if it
+ * just came from __alloc_skb().
+ */
+int skb_recycle_check(struct sk_buff *skb, int skb_size)
+{
+ struct skb_shared_info *shinfo;
+
+ if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ return 0; /* paged data or an fclone-cache skb: not recyclable */
+
+ skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
+ if (skb_end_pointer(skb) - skb->head < skb_size)
+ return 0; /* linear buffer too small for the requested size plus padding */
+
+ if (skb_shared(skb) || skb_cloned(skb))
+ return 0;
+
+ skb_release_head_state(skb); /* past this point the skb is modified; every check above left it untouched */
+ shinfo = skb_shinfo(skb);
+ atomic_set(&shinfo->dataref, 1);
+ shinfo->nr_frags = 0;
+ shinfo->gso_size = 0;
+ shinfo->gso_segs = 0;
+ shinfo->gso_type = 0;
+ shinfo->ip6_frag_id = 0;
+ shinfo->frag_list = NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail)); /* same partial clear as in __alloc_skb() */
+ skb->data = skb->head + NET_SKB_PAD;
+ skb_reset_tail_pointer(skb);
+
+ return 1;
+}
+EXPORT_SYMBOL(skb_recycle_check);
+
+static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{ /* Copy the per-packet metadata fields from old to new, taking references where the field is refcounted. */
+ new->tstamp = old->tstamp;
+ new->dev = old->dev;
+ new->transport_header = old->transport_header;
+ new->network_header = old->network_header;
+ new->mac_header = old->mac_header;
+ new->dst = dst_clone(old->dst);
+#ifdef CONFIG_INET
+ new->sp = secpath_get(old->sp); /* NOTE(review): guarded by CONFIG_INET, while secpath_put() in skb_release_head_state() is under CONFIG_XFRM - confirm intended */
+#endif
+ memcpy(new->cb, old->cb, sizeof(old->cb));
+ new->csum_start = old->csum_start;
+ new->csum_offset = old->csum_offset;
+ new->local_df = old->local_df;
+ new->pkt_type = old->pkt_type;
+ new->ip_summed = old->ip_summed;
+ skb_copy_queue_mapping(new, old);
+ new->priority = old->priority;
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ new->ipvs_property = old->ipvs_property;
+#endif
+ new->protocol = old->protocol;
+ new->mark = old->mark;
+ __nf_copy(new, old);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+ new->nf_trace = old->nf_trace;
+#endif
+#ifdef CONFIG_NET_SCHED
+ new->tc_index = old->tc_index;
+#ifdef CONFIG_NET_CLS_ACT
+ new->tc_verd = old->tc_verd;
+#endif
+#endif
+ new->vlan_tci = old->vlan_tci;
+
+ skb_copy_secmark(new, old);
+}
+
+static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{ /* Turn the caller-supplied head n into a clone of skb that shares skb's data area; returns n. */
+#define C(x) n->x = skb->x
+
+ n->next = n->prev = NULL;
+ n->sk = NULL; /* the clone is not attached to a socket */
+ __copy_skb_header(n, skb);
+
+ C(len);
+ C(data_len);
+ C(mac_len);
+ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
+ n->cloned = 1;
+ n->nohdr = 0;
+ n->destructor = NULL;
+ C(iif);
+ C(tail);
+ C(end);
+ C(head);
+ C(data);
+ C(truesize);
+#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
+ C(do_not_encrypt);
+#endif
+ atomic_set(&n->users, 1);
+
+ atomic_inc(&(skb_shinfo(skb)->dataref)); /* one more holder of the shared data area */
+ skb->cloned = 1; /* the source is marked cloned as well */
+
+ return n;
+#undef C
+}
+
+/**
+ * skb_morph - morph one skb into another
+ * @dst: the skb to receive the contents
+ * @src: the skb to supply the contents
+ *
+ * This is identical to skb_clone except that the target skb is
+ * supplied by the user.
+ *
+ * The target skb is returned upon exit.
+ */
+struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
+{
+ skb_release_all(dst); /* drop whatever dst referenced before it is overwritten */
+ return __skb_clone(dst, src);
+}
+EXPORT_SYMBOL_GPL(skb_morph);
+
+/**
+ * skb_clone - duplicate an sk_buff
+ * @skb: buffer to clone
+ * @gfp_mask: allocation priority
+ *
+ * Duplicate an &sk_buff. The new one is not owned by a socket. Both
+ * copies share the same packet data but not structure. The new
+ * buffer has a reference count of 1. If the allocation fails the
+ * function returns %NULL otherwise the new buffer is returned.
+ *
+ * If this function is called from an interrupt gfp_mask() must be
+ * %GFP_ATOMIC.
+ */
+
+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ struct sk_buff *n;
+
+ n = skb + 1; /* candidate: companion slot; only dereferenced if skb is an fclone original (short-circuit below) */
+ if (skb->fclone == SKB_FCLONE_ORIG &&
+ n->fclone == SKB_FCLONE_UNAVAILABLE) {
+ atomic_t *fclone_ref = (atomic_t *) (n + 1);
+ n->fclone = SKB_FCLONE_CLONE;
+ atomic_inc(fclone_ref); /* the pair's shared counter now also covers the clone */
+ } else {
+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); /* no free companion: allocate a separate head */
+ if (!n)
+ return NULL;
+ n->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
+
+ return __skb_clone(n, skb);
+}
+
+static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{ /* __copy_skb_header() plus header rebasing and GSO fields, for a new skb that has its own data area. */
+#ifndef NET_SKBUFF_DATA_USES_OFFSET
+ /*
+ * Shift between the two data areas in bytes
+ */
+ unsigned long offset = new->data - old->data;
+#endif
+
+ __copy_skb_header(new, old);
+
+#ifndef NET_SKBUFF_DATA_USES_OFFSET
+ /* {transport,network,mac}_header are relative to skb->head */
+ new->transport_header += offset;
+ new->network_header += offset;
+ new->mac_header += offset;
+#endif
+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
+}
+
+/**
+ * skb_copy - create private copy of an sk_buff
+ * @skb: buffer to copy
+ * @gfp_mask: allocation priority
+ *
+ * Copy both the &sk_buff and all of its data. Use this when the
+ * caller intends to modify the data and therefore needs a private
+ * copy. Returns %NULL on failure or the pointer to the new buffer on
+ * success. The returned buffer has a reference count of 1.
+ *
+ * As a by-product a non-linear &sk_buff is converted into a linear
+ * one, so the result is completely private and the caller may modify
+ * all of its data. This makes the function a poor choice when only
+ * the header is going to be modified; use pskb_copy() for that.
+ */
+
+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
+{
+	struct sk_buff *n;
+	int head_len = skb->data - skb->head;
+
+	/* Room for the whole linear buffer plus the non-linear data. */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
+#else
+	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
+#endif
+	if (!n)
+		return NULL;
+
+	/* Same headroom as the original, then claim skb->len bytes. */
+	skb_reserve(n, head_len);
+	skb_put(n, skb->len);
+
+	/* The negative offset makes the copy start at skb->head. */
+	if (skb_copy_bits(skb, -head_len, n->head, head_len + skb->len))
+		BUG();
+
+	copy_skb_header(n, skb);
+
+	return n;
+}
+
+
+/**
+ * pskb_copy - create copy of an sk_buff with private head.
+ * @skb: buffer to copy
+ * @gfp_mask: allocation priority
+ *
+ * Copy the &sk_buff and the part of its data that lives in the
+ * header. Fragmented data remain shared. Use this when only the
+ * header of the &sk_buff is going to be modified and a private copy
+ * of that header is needed. Returns %NULL on failure or the pointer
+ * to the new buffer on success.
+ * The returned buffer has a reference count of 1.
+ */
+
+struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	struct sk_buff *n;
+
+	/* The new linear buffer is as large as the old one. */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	n = alloc_skb(skb->end, gfp_mask);
+#else
+	n = alloc_skb(skb->end - skb->head, gfp_mask);
+#endif
+	if (!n)
+		return NULL;
+
+	/* Same headroom, then copy only the linear part. */
+	skb_reserve(n, skb->data - skb->head);
+	skb_put(n, skb_headlen(skb));
+	skb_copy_from_linear_data(skb, n->data, n->len);
+
+	/* Account for the non-linear data the copy is about to reference. */
+	n->len = skb->len;
+	n->data_len = skb->data_len;
+	n->truesize += skb->data_len;
+
+	if (skb_shinfo(skb)->nr_frags) {
+		int idx = 0;
+
+		/* Share every page fragment, taking a page reference each. */
+		while (idx < skb_shinfo(skb)->nr_frags) {
+			skb_shinfo(n)->frags[idx] = skb_shinfo(skb)->frags[idx];
+			get_page(skb_shinfo(n)->frags[idx].page);
+			idx++;
+		}
+		skb_shinfo(n)->nr_frags = idx;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
+		skb_clone_fraglist(n);
+	}
+
+	copy_skb_header(n, skb);
+
+	return n;
+}
+
+/**
+ * pskb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @nhead: room to add at head
+ * @ntail: room to add at tail
+ * @gfp_mask: allocation priority
+ *
+ * Expands (or creates identical copy, if &nhead and &ntail are zero)
+ * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
+ * reference count of 1. Returns zero on success or -ENOMEM if the
+ * new buffer could not be allocated; in that case the &sk_buff is
+ * not changed. A negative @nhead or a shared &sk_buff hits BUG().
+ *
+ * All the pointers pointing into skb header may change and must be
+ * reloaded after call to this function.
+ */
+
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ gfp_t gfp_mask)
+{
+ int i;
+ u8 *data;
+ /* New linear size: old buffer size plus the requested extra room. */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ int size = nhead + skb->end + ntail;
+#else
+ int size = nhead + (skb->end - skb->head) + ntail;
+#endif
+ long off;
+
+ BUG_ON(nhead < 0);
+
+ if (skb_shared(skb))
+ BUG();
+
+ size = SKB_DATA_ALIGN(size);
+
+ /* The skb_shared_info block sits right behind the data area. */
+ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+ if (!data)
+ goto nodata;
+
+ /* Copy only real data... and, alas, header. This should be
+ * optimized for the cases when header is void. */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ memcpy(data + nhead, skb->head, skb->tail);
+#else
+ memcpy(data + nhead, skb->head, skb->tail - skb->head);
+#endif
+ memcpy(data + size, skb_end_pointer(skb),
+ sizeof(struct skb_shared_info));
+
+ /* Take references for the new buffer before the old data is released. */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ get_page(skb_shinfo(skb)->frags[i].page);
+
+ if (skb_shinfo(skb)->frag_list)
+ skb_clone_fraglist(skb);
+
+ skb_release_data(skb);
+
+ /* Distance by which everything that pointed into the old head moves. */
+ off = (data + nhead) - skb->head;
+
+ skb->head = data;
+ skb->data += off;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+ /* With offsets, the remaining fields only shift by the added headroom. */
+ off = nhead;
+#else
+ skb->end = skb->head + size;
+#endif
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->tail += off;
+ skb->transport_header += off;
+ skb->network_header += off;
+ skb->mac_header += off;
+ skb->csum_start += nhead;
+ /* The skb is now the sole user of the freshly allocated data. */
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+ return 0;
+
+nodata:
+ return -ENOMEM;
+}
+
+/*
+ * Make a private copy of skb with a writable head and at least
+ * @headroom bytes of headroom. Returns %NULL if any allocation fails.
+ */
+
+struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
+{
+	int delta = headroom - skb_headroom(skb);
+	struct sk_buff *skb2;
+
+	/* Enough headroom already: copying the head is all that is needed. */
+	if (delta <= 0)
+		return pskb_copy(skb, GFP_ATOMIC);
+
+	skb2 = skb_clone(skb, GFP_ATOMIC);
+	if (!skb2)
+		return NULL;
+
+	/* Grow the head of the clone by the missing amount. */
+	if (pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+		kfree_skb(skb2);
+		return NULL;
+	}
+
+	return skb2;
+}
+
+
+/**
+ * skb_copy_expand - copy and expand sk_buff
+ * @skb: buffer to copy
+ * @newheadroom: new free bytes at head
+ * @newtailroom: new free bytes at tail
+ * @gfp_mask: allocation priority
+ *
+ * Copy both the &sk_buff and its data, allocating additional space
+ * while doing so.
+ *
+ * Use this when the caller wants to modify the data and needs a
+ * private copy of it as well as more room for new fields.
+ * Returns %NULL on failure or the pointer to the new buffer on
+ * success. The returned buffer has a reference count of 1.
+ *
+ * %GFP_ATOMIC must be passed as the allocation priority when this
+ * function is called from an interrupt.
+ */
+struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
+				int newheadroom, int newtailroom,
+				gfp_t gfp_mask)
+{
+	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
+				      gfp_mask);
+	int oldheadroom = skb_headroom(skb);
+	int keep, skip;
+	int shift;
+
+	if (!n)
+		return NULL;
+
+	skb_reserve(n, newheadroom);
+	skb_put(n, skb->len);
+
+	/*
+	 * Carry over as much of the old headroom as fits in the new one;
+	 * skip is where the copy starts within the new headroom.
+	 */
+	if (newheadroom > oldheadroom) {
+		keep = oldheadroom;
+		skip = newheadroom - oldheadroom;
+	} else {
+		keep = newheadroom;
+		skip = 0;
+	}
+
+	/* Copy the retained headroom together with all packet data. */
+	if (skb_copy_bits(skb, -keep, n->head + skip, skb->len + keep))
+		BUG();
+
+	copy_skb_header(n, skb);
+
+	/* Adjust for the change in headroom between old and new buffer. */
+	shift = newheadroom - oldheadroom;
+	n->csum_start += shift;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	n->transport_header += shift;
+	n->network_header += shift;
+	n->mac_header += shift;
+#endif
+
+	return n;
+}
+
+/**
+ * skb_pad - zero pad the tail of an skb
+ * @skb: buffer to pad
+ * @pad: space to pad
+ *
+ * Ensure that a buffer is followed by a padding area that is zero
+ * filled. Used by network drivers which may DMA or transfer data
+ * beyond the buffer end onto the wire.
+ *
+ * Returns 0 on success. May return error in out of memory cases
+ * (from pskb_expand_head() or skb_linearize()). The skb is freed on
+ * error.
+ */
+
+int skb_pad(struct sk_buff *skb, int pad)
+{
+ int err;
+ int ntail;
+
+ /* If the skbuff is non linear tailroom is always zero.. */
+ /* Fast path: private buffer with enough room, just clear the padding. */
+ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
+ memset(skb->data+skb->len, 0, pad);
+ return 0;
+ }
+
+ /* Extra tail space needed beyond what is left between tail and end. */
+ ntail = skb->data_len + pad - (skb->end - skb->tail);
+ if (likely(skb_cloned(skb) || ntail > 0)) {
+ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+ if (unlikely(err))
+ goto free_skb;
+ }
+
+ /* FIXME: The use of this function with non-linear skb's really needs
+ * to be audited.
+ */
+ err = skb_linearize(skb);
+ if (unlikely(err))
+ goto free_skb;
+
+ memset(skb->data + skb->len, 0, pad);
+ return 0;
+
+free_skb:
+ kfree_skb(skb);
+ return err;
+}
+
+/**
+ * skb_put - add data to a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * Extends the used data area of the buffer at its tail. If that
+ * would exceed the total buffer size the kernel will panic. A
+ * pointer to the first byte of the extra data is returned.
+ */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
+{
+	unsigned char *old_tail = skb_tail_pointer(skb);
+
+	SKB_LINEAR_ASSERT(skb);
+
+	skb->len += len;
+	skb->tail += len;
+	if (unlikely(skb->tail > skb->end))
+		skb_over_panic(skb, len, __builtin_return_address(0));
+
+	/* The new data starts where the tail used to be. */
+	return old_tail;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ * skb_push - add data to the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * Extends the used data area of the buffer at its start. If that
+ * would exceed the available headroom the kernel will panic. A
+ * pointer to the first byte of the extra data is returned.
+ */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+{
+	skb->len += len;
+	skb->data -= len;
+
+	/* The data pointer must never move in front of the buffer head. */
+	if (unlikely(skb->data < skb->head))
+		skb_under_panic(skb, len, __builtin_return_address(0));
+
+	return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ * skb_pull - remove data from the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * Removes data from the start of a buffer, returning the memory to
+ * the headroom. A pointer to the next data in the buffer is
+ * returned, or %NULL if @len exceeds the buffer length. Once the
+ * data has been pulled, future pushes will overwrite the old data.
+ */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+	/* Refuse to pull more than the buffer holds. */
+	if (unlikely(len > skb->len))
+		return NULL;
+
+	return __skb_pull(skb, len);
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ * skb_trim - remove end from a buffer
+ * @skb: buffer to alter
+ * @len: new length
+ *
+ * Cuts a buffer down to @len by removing data from the tail. A
+ * buffer that is already no longer than @len is left untouched.
+ * The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+	/* Nothing to do when the buffer is already short enough. */
+	if (skb->len <= len)
+		return;
+
+	__skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
+
+/* Trims skb to length len. It can change skb pointers.
+ *
+ * Handles the linear part, the page fragments and the frag_list.
+ * Returns 0 on success, the error from pskb_expand_head() or
+ * pskb_trim(), or -ENOMEM if a shared list fragment cannot be cloned.
+ */
+
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
+{
+ struct sk_buff **fragp;
+ struct sk_buff *frag;
+ int offset = skb_headlen(skb);
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ int i;
+ int err;
+
+ /* A cloned skb gets its own copy of the head before it is modified. */
+ if (skb_cloned(skb) &&
+ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
+ return err;
+
+ i = 0;
+ /* New length ends inside the linear part: every page frag goes (i == 0). */
+ if (offset >= len)
+ goto drop_pages;
+
+ for (; i < nfrags; i++) {
+ int end = offset + skb_shinfo(skb)->frags[i].size;
+
+ /* This fragment lies completely before the cut; keep it whole. */
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ /* The cut falls in (or at the end of) frag i: shorten it, keep i + 1. */
+ skb_shinfo(skb)->frags[i++].size = len - offset;
+
+drop_pages:
+ skb_shinfo(skb)->nr_frags = i;
+
+ /* Release the pages of all fragments behind the cut. */
+ for (; i < nfrags; i++)
+ put_page(skb_shinfo(skb)->frags[i].page);
+
+ if (skb_shinfo(skb)->frag_list)
+ skb_drop_fraglist(skb);
+ goto done;
+ }
+
+ /* All page frags are kept; the cut is somewhere in the frag_list. */
+ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
+ fragp = &frag->next) {
+ int end = offset + frag->len;
+
+ /* Replace a shared list member by a private clone before touching it. */
+ if (skb_shared(frag)) {
+ struct sk_buff *nfrag;
+
+ nfrag = skb_clone(frag, GFP_ATOMIC);
+ if (unlikely(!nfrag))
+ return -ENOMEM;
+
+ nfrag->next = frag->next;
+ kfree_skb(frag);
+ frag = nfrag;
+ *fragp = frag;
+ }
+
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ /* Trim the member that contains the cut, then drop the rest of the list. */
+ if (end > len &&
+ unlikely((err = pskb_trim(frag, len - offset))))
+ return err;
+
+ if (frag->next)
+ skb_drop_list(&frag->next);
+ break;
+ }
+
+done:
+ /* Update the length accounting; a fully linear result also moves tail. */
+ if (len > skb_headlen(skb)) {
+ skb->data_len -= skb->len - len;
+ skb->len = len;
+ } else {
+ skb->len = len;
+ skb->data_len = 0;
+ skb_set_tail_pointer(skb, len);
+ }
+
+ return 0;
+}
+
+/**
+ * __pskb_pull_tail - advance tail of skb header
+ * @skb: buffer to reallocate
+ * @delta: number of bytes to advance tail
+ *
+ * The function makes sense only on a fragmented &sk_buff,
+ * it expands header moving its tail forward and copying necessary
+ * data from fragmented part.
+ *
+ * &sk_buff MUST have reference count of 1.
+ *
+ * Returns %NULL (and &sk_buff does not change) if pull failed
+ * or value of new tail of skb in the case of success.
+ *
+ * All the pointers pointing into skb header may change and must be
+ * reloaded after call to this function.
+ */
+
+/* Moves tail of skb head forward, copying data from fragmented part,
+ * when it is necessary.
+ * 1. It may fail due to malloc failure.
+ * 2. It may change skb pointers.
+ *
+ * It is pretty complicated. Luckily, it is called only in exceptional cases.
+ */
+unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
+{
+ /* If skb has not enough free space at tail, get new one
+ * plus 128 bytes for future expansions. If we have enough
+ * room at tail, reallocate without expansion only if skb is cloned.
+ */
+ int i, k, eat = (skb->tail + delta) - skb->end;
+
+ if (eat > 0 || skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
+ GFP_ATOMIC))
+ return NULL;
+ }
+
+ /* Copy the next delta bytes of non-linear data to the current tail. */
+ if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
+ BUG();
+
+ /* Optimization: no fragments, no reasons to preestimate
+ * size of pulled pages. Superb.
+ */
+ if (!skb_shinfo(skb)->frag_list)
+ goto pull_pages;
+
+ /* Estimate size of pulled pages. */
+ eat = delta;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ if (skb_shinfo(skb)->frags[i].size >= eat)
+ goto pull_pages;
+ eat -= skb_shinfo(skb)->frags[i].size;
+ }
+
+ /* If we need to update the frag list, we are in trouble.
+ * Certainly, it is possible to add an offset to skb data,
+ * but taking into account that pulling is expected to
+ * be a very rare operation, it is worth fighting against
+ * further bloating of the skb head and crucifying ourselves
+ * here instead. Pure masochism, indeed. 8)8)
+ */
+ if (eat) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ /* Find insp, the first list member that stays after the pull. */
+ do {
+ BUG_ON(!list);
+
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+
+ if (skb_shared(list)) {
+ /* Sucks! We need to fork list. :-( */
+ clone = skb_clone(list, GFP_ATOMIC);
+ if (!clone)
+ return NULL;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without
+ * problems. */
+ insp = list;
+ }
+ if (!pskb_pull(list, eat)) {
+ if (clone)
+ kfree_skb(clone);
+ return NULL;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = skb_shinfo(skb)->frag_list) != insp) {
+ skb_shinfo(skb)->frag_list = list->next;
+ kfree_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ skb_shinfo(skb)->frag_list = clone;
+ }
+ }
+ /* Success! Now we may commit changes to skb data. */
+
+pull_pages:
+ /* Drop fully consumed page frags and compact the rest to the front. */
+ eat = delta;
+ k = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ if (skb_shinfo(skb)->frags[i].size <= eat) {
+ put_page(skb_shinfo(skb)->frags[i].page);
+ eat -= skb_shinfo(skb)->frags[i].size;
+ } else {
+ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ /* First surviving frag may be partially consumed: skip its start. */
+ if (eat) {
+ skb_shinfo(skb)->frags[k].page_offset += eat;
+ skb_shinfo(skb)->frags[k].size -= eat;
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ skb_shinfo(skb)->nr_frags = k;
+
+ /* delta bytes moved from the non-linear part into the linear part. */
+ skb->tail += delta;
+ skb->data_len -= delta;
+
+ return skb_tail_pointer(skb);
+}
+
+/* Copy some data bits from skb to kernel buffer.
+ *
+ * Copies len bytes starting at offset (relative to skb->data) into to,
+ * walking the linear part, the page fragments and the frag_list in
+ * that order. Returns 0 on success or -EFAULT if the requested range
+ * is not fully covered by the skb.
+ */
+
+int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
+{
+ int i, copy;
+ /* start is the skb offset at which the region being examined begins. */
+ int start = skb_headlen(skb);
+
+ /* Reject ranges that end beyond the end of the packet. */
+ if (offset > (int)skb->len - len)
+ goto fault;
+
+ /* Copy header. */
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ skb_copy_from_linear_data_offset(skb, offset, to, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+
+ /* Copy from the page fragments; frag i covers [start, end). */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) {
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ /* Map the fragment only for the duration of the copy. */
+ vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ memcpy(to,
+ vaddr + skb_shinfo(skb)->frags[i].page_offset+
+ offset - start, copy);
+ kunmap_skb_frag(vaddr);
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ /* Copy from the skbs on the frag_list, recursing into each one. */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_bits(list, offset - start,
+ to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+
+/*
+ * Release callback handed to splice_to_pipe(): drops the reference
+ * on page @i of the spd when that page has to be released because
+ * filling the pipe errored out.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+	struct page *unused = spd->pages[i];
+
+	put_page(unused);
+}
+
+/*
+ * Allocate a fresh page and copy @len bytes at @offset from @page into
+ * it at the same offset. Returns the new page, or NULL if none could
+ * be allocated.
+ */
+static inline struct page *linear_to_page(struct page *page, unsigned int len,
+					  unsigned int offset)
+{
+	struct page *copy = alloc_pages(GFP_KERNEL, 0);
+
+	if (copy)
+		memcpy(page_address(copy) + offset,
+		       page_address(page) + offset, len);
+
+	return copy;
+}
+
+/*
+ * Append page/offset/length to the spd if it can hold more pages.
+ * Data from the linear area (@linear set) is first copied into a
+ * newly allocated page; other pages just get an extra reference.
+ * Returns 0 when the entry was added, 1 when the spd is full or the
+ * copy page could not be allocated.
+ */
+static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+				unsigned int len, unsigned int offset,
+				struct sk_buff *skb, int linear)
+{
+	struct partial_page *part;
+
+	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+		return 1;
+
+	if (!linear) {
+		get_page(page);
+	} else {
+		page = linear_to_page(page, len, offset);
+		if (!page)
+			return 1;
+	}
+
+	/* Record the page and its valid byte range in the next free slot. */
+	part = &spd->partial[spd->nr_pages];
+	spd->pages[spd->nr_pages] = page;
+	part->len = len;
+	part->offset = offset;
+	spd->nr_pages++;
+
+	return 0;
+}
+
+/*
+ * Advance a (page, offset, remaining length) cursor by @off bytes,
+ * stepping *page forward whenever the offset crosses a page boundary.
+ */
+static inline void __segment_seek(struct page **page, unsigned int *poff,
+				  unsigned int *plen, unsigned int off)
+{
+	unsigned int pos = *poff + off;
+
+	*page += pos / PAGE_SIZE;
+	*poff = pos % PAGE_SIZE;
+	*plen -= off;
+}
+
+/*
+ * Map one segment (page, poff, plen) of an skb into the spd.
+ * *off is the number of bytes still to be skipped before splicing
+ * starts and *len the number of bytes still wanted; both are updated.
+ * Returns 1 if nothing more is wanted (*len == 0) or spd_fill_page()
+ * refused the page, 0 otherwise.
+ */
+static inline int __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len, struct sk_buff *skb,
+ struct splice_pipe_desc *spd, int linear)
+{
+ if (!*len)
+ return 1;
+
+ /* skip this segment if already processed */
+ if (*off >= plen) {
+ *off -= plen;
+ return 0;
+ }
+
+ /* ignore any bits we already processed */
+ if (*off) {
+ __segment_seek(&page, &poff, &plen, *off);
+ *off = 0;
+ }
+
+ /* Hand the segment to the spd in pieces that never cross a page. */
+ do {
+ unsigned int flen = min(*len, plen);
+
+ /* the linear region may spread across several pages */
+ flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
+
+ if (spd_fill_page(spd, page, flen, poff, skb, linear))
+ return 1;
+
+ __segment_seek(&page, &poff, &plen, flen);
+ *len -= flen;
+
+ } while (*len && plen);
+
+ return 0;
+}
+
+/*
+ * Map the linear data and the page fragments of the skb into the spd.
+ * Reports failure (1) if the pipe is full or if the requested length
+ * has already been spliced; returns 0 otherwise.
+ */
+static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
+			     unsigned int *len,
+			     struct splice_pipe_desc *spd)
+{
+	int seg = 0;
+
+	/* The linear part goes first; it is flagged as linear for the spd. */
+	if (__splice_segment(virt_to_page(skb->data),
+			     (unsigned long) skb->data & (PAGE_SIZE - 1),
+			     skb_headlen(skb),
+			     offset, len, skb, spd, 1))
+		return 1;
+
+	/* Then each page fragment in order. */
+	while (seg < skb_shinfo(skb)->nr_frags) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[seg];
+
+		if (__splice_segment(frag->page, frag->page_offset,
+				     frag->size, offset, len, skb, spd, 0))
+			return 1;
+		seg++;
+	}
+
+	return 0;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list. It does NOT handle frag lists within
+ * the frag list, if such a thing exists. We'd probably need to recurse to
+ * handle that cleanly.
+ *
+ * Up to tlen bytes starting at offset are mapped. Returns the result of
+ * splice_to_pipe(), or 0 if no page could be mapped at all.
+ */
+int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+ struct pipe_inode_info *pipe, unsigned int tlen,
+ unsigned int flags)
+{
+ /* On-stack page and range arrays backing the splice descriptor. */
+ struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_BUFFERS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &sock_pipe_buf_ops,
+ .spd_release = sock_spd_release,
+ };
+
+ /*
+ * __skb_splice_bits() only fails if the output has no room left,
+ * so no point in going over the frag_list for the error case.
+ */
+ if (__skb_splice_bits(skb, &offset, &tlen, &spd))
+ goto done;
+ else if (!tlen)
+ goto done;
+
+ /*
+ * now see if we have a frag_list to map
+ */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ /* offset and tlen carry over from one list member to the next. */
+ for (; list && tlen; list = list->next) {
+ if (__skb_splice_bits(list, &offset, &tlen, &spd))
+ break;
+ }
+ }
+
+done:
+ if (spd.nr_pages) {
+ struct sock *sk = skb->sk;
+ int ret;
+
+ /*
+ * Drop the socket lock, otherwise we have reverse
+ * locking dependencies between sk_lock and i_mutex
+ * here as compared to sendfile(). We enter here
+ * with the socket lock held, and splice_to_pipe() will
+ * grab the pipe inode lock. For sendfile() emulation,
+ * we call into ->sendpage() with the i_mutex lock held
+ * and networking will grab the socket lock.
+ */
+ release_sock(sk);
+ ret = splice_to_pipe(pipe, &spd);
+ lock_sock(sk);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * skb_store_bits - store bits from kernel buffer to skb
+ * @skb: destination buffer
+ * @offset: offset in destination
+ * @from: source buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source buffer to the
+ * destination skb. This function handles all the messy bits of
+ * traversing fragment lists and such.
+ *
+ * Returns 0 on success or -EFAULT if the destination range is not
+ * fully covered by the skb.
+ */
+
+int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
+{
+ int i, copy;
+ /* start is the skb offset at which the region being examined begins. */
+ int start = skb_headlen(skb);
+
+ /* Reject ranges that end beyond the end of the packet. */
+ if (offset > (int)skb->len - len)
+ goto fault;
+
+ /* Store into the linear part first. */
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ skb_copy_to_linear_data_offset(skb, offset, from, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+
+ /* Then into the page fragments; frag i covers [start, end). */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag->size;
+ if ((copy = end - offset) > 0) {
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ /* Map the fragment only for the duration of the copy. */
+ vaddr = kmap_skb_frag(frag);
+ memcpy(vaddr + frag->page_offset + offset - start,
+ from, copy);
+ kunmap_skb_frag(vaddr);
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+ start = end;
+ }
+
+ /* Finally into the skbs on the frag_list, recursing into each one. */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_store_bits(list, offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+
+EXPORT_SYMBOL(skb_store_bits);
+
+/* Checksum skb data.
+ *
+ * Folds the checksum of len bytes starting at offset into csum and
+ * returns the result. The linear part, the page fragments and the
+ * frag_list are visited in that order. Hits BUG_ON() if the skb ends
+ * before len bytes have been processed.
+ */
+
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+ int len, __wsum csum)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ /* Number of bytes summed so far; passed to csum_block_add() below. */
+ int pos = 0;
+
+ /* Checksum header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ csum = csum_partial(skb->data + offset, copy, csum);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos = copy;
+ }
+
+ /* Checksum the page fragments; frag i covers [start, end). */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap_skb_frag(frag);
+ /* Sum this piece on its own, then merge it at position pos. */
+ csum2 = csum_partial(vaddr + frag->page_offset +
+ offset - start, copy, 0);
+ kunmap_skb_frag(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ if (!(len -= copy))
+ return csum;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ /* Checksum the skbs on the frag_list, recursing into each one. */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ if (copy > len)
+ copy = len;
+ csum2 = skb_checksum(list, offset - start,
+ copy, 0);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ }
+ BUG_ON(len);
+
+ return csum;
+}
+
+/* Both of above in one bottle.
+ *
+ * Copies len bytes starting at offset from the skb into to and folds
+ * their checksum into csum, which is returned. The linear part, the
+ * page fragments and the frag_list are visited in that order. Hits
+ * BUG_ON() if the skb ends before len bytes have been processed.
+ */
+
+__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
+ u8 *to, int len, __wsum csum)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ /* Number of bytes handled so far; passed to csum_block_add() below. */
+ int pos = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ csum = csum_partial_copy_nocheck(skb->data + offset, to,
+ copy, csum);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos = copy;
+ }
+
+ /* Copy and checksum the page fragments; frag i covers [start, end). */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap_skb_frag(frag);
+ /* Handle this piece on its own, then merge it at position pos. */
+ csum2 = csum_partial_copy_nocheck(vaddr +
+ frag->page_offset +
+ offset - start, to,
+ copy, 0);
+ kunmap_skb_frag(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ if (!(len -= copy))
+ return csum;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ /* Continue with the skbs on the frag_list, recursing into each one. */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ __wsum csum2;
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ csum2 = skb_copy_and_csum_bits(list,
+ offset - start,
+ to, copy, 0);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ }
+ BUG_ON(len);
+ return csum;
+}
+
+/*
+ * Copy the whole packet into the buffer at to. For CHECKSUM_PARTIAL
+ * skbs the data from the checksum start onwards is checksummed while
+ * it is copied, and the folded result is written into the copy at
+ * csum_offset behind the checksum start.
+ */
+void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
+{
+ __wsum csum;
+ long csstart;
+
+ /* csstart: offset from skb->data at which checksumming begins. */
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ csstart = skb->csum_start - skb_headroom(skb);
+ else
+ csstart = skb_headlen(skb);
+
+ BUG_ON(csstart > skb_headlen(skb));
+
+ /* Everything in front of csstart is copied without checksumming. */
+ skb_copy_from_linear_data(skb, to, csstart);
+
+ csum = 0;
+ if (csstart != skb->len)
+ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
+ skb->len - csstart, 0);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ long csstuff = csstart + skb->csum_offset;
+
+ /* Store the folded checksum in the copy, not in the skb itself. */
+ *((__sum16 *)(to + csstuff)) = csum_fold(csum);
+ }
+}
+
+/**
+ * skb_dequeue - remove from the head of the queue
+ * @list: list to dequeue from
+ *
+ * Takes the first buffer off the list while holding the list lock,
+ * so it may be used safely alongside the other locking list
+ * functions. Returns the head item, or %NULL if the list is empty.
+ */
+
+struct sk_buff *skb_dequeue(struct sk_buff_head *list)
+{
+	struct sk_buff *head;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	head = __skb_dequeue(list);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+
+	return head;
+}
+
+/**
+ * skb_dequeue_tail - remove from the tail of the queue
+ * @list: list to dequeue from
+ *
+ * Takes the last buffer off the list while holding the list lock,
+ * so it may be used safely alongside the other locking list
+ * functions. Returns the tail item, or %NULL if the list is empty.
+ */
+struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
+{
+	struct sk_buff *tail;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	tail = __skb_dequeue_tail(list);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+
+	return tail;
+}
+
+/**
+ * skb_queue_purge - empty a list
+ * @list: list to empty
+ *
+ * Deletes all buffers on an &sk_buff list. Each buffer is removed
+ * from the list and one reference to it is dropped. The list lock
+ * is taken for every removal, so each removal is atomic with
+ * respect to the other list locking functions.
+ */
+void skb_queue_purge(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+
+	for (skb = skb_dequeue(list); skb != NULL; skb = skb_dequeue(list))
+		kfree_skb(skb);
+}
+
+/**
+ * skb_queue_head - queue a buffer at the list head
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queues a buffer at the start of the list. The list lock is taken,
+ * so the function can be used safely with the other locking
+ * &sk_buff functions.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	__skb_queue_head(list, newsk);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+}
+
+/**
+ * skb_queue_tail - queue a buffer at the list tail
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queues a buffer at the end of the list. The list lock is taken,
+ * so the function can be used safely with the other locking
+ * &sk_buff functions.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	__skb_queue_tail(list, newsk);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+}
+
+/**
+ * skb_unlink - remove a buffer from a list
+ * @skb: buffer to remove
+ * @list: list to use
+ *
+ * Removes a packet from a list. The list lock is taken, so the
+ * function is atomic with respect to other list locked calls.
+ *
+ * You must know what list the SKB is on.
+ */
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	__skb_unlink(skb, list);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+}
+
+/**
+ * skb_append - append a buffer
+ * @old: buffer to insert after
+ * @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Places a packet after a given packet in a list. The list lock is
+ * taken, so the function is atomic with respect to other list locked
+ * calls.
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	__skb_queue_after(list, old, newsk);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+}
+
+
+/**
+ * skb_insert - insert a buffer
+ * @old: buffer to insert before
+ * @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Places a packet before a given packet in a list. The list lock is
+ * taken, so the function is atomic with respect to other list locked
+ * calls.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&list->lock, irq_flags);
+	/* old->prev is read under the lock, so newsk lands right in front of old. */
+	__skb_insert(newsk, old->prev, old, list);
+	spin_unlock_irqrestore(&list->lock, irq_flags);
+}
+
+/*
+ * Split @skb at @len when the split point lies inside its linear part
+ * (@pos is the linear length). The linear bytes behind @len are copied
+ * into @skb1 and all page fragments are handed over to @skb1.
+ */
+static inline void skb_split_inside_header(struct sk_buff *skb,
+					   struct sk_buff* skb1,
+					   const u32 len, const int pos)
+{
+	const u32 moved = pos - len;
+	int i;
+
+	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, moved),
+					 moved);
+
+	/* The page fragments move over unchanged. */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
+	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+	skb_shinfo(skb)->nr_frags = 0;
+
+	/* skb1 now accounts for all non-linear data. */
+	skb1->data_len = skb->data_len;
+	skb1->len += skb1->data_len;
+
+	/* skb keeps only the first len linear bytes. */
+	skb->data_len = 0;
+	skb->len = len;
+	skb_set_tail_pointer(skb, len);
+}
+
+static inline void skb_split_no_header(struct sk_buff *skb,
+ struct sk_buff* skb1,
+ const u32 len, int pos)
+{ /* Split at @len when no linear data moves: only page fragments are distributed between @skb and @skb1. */
+ int i, k = 0; /* i walks skb's fragments, k counts fragments placed in skb1 */
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+
+ skb_shinfo(skb)->nr_frags = 0; /* rebuilt below as fragments are kept */
+ skb1->len = skb1->data_len = skb->len - len; /* skb1 receives everything past len, all of it paged */
+ skb->len = len;
+ skb->data_len = len - pos; /* pos is still the linear header length here */
+
+ for (i = 0; i < nfrags; i++) {
+ int size = skb_shinfo(skb)->frags[i].size;
+
+ if (pos + size > len) { /* fragment reaches past the split point */
+ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < len) {
+ /* Split frag.
+ * There are two possible strategies here:
+ * 1. Move the whole frag to the second
+ * part, if that is possible. E.g.
+ * this approach is mandatory for TUX,
+ * where splitting is expensive.
+ * 2. Split it accurately. This is what we do:
+ * both skbs reference the same page (hence
+ * the extra get_page()), skb keeps the first
+ * len - pos bytes and skb1 the remainder.
+ * k is 0 here, because no earlier fragment
+ * reached past len, so frags[0] of skb1 is
+ * the copy made just above.
+ */
+ get_page(skb_shinfo(skb)->frags[i].page);
+ skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+ skb_shinfo(skb1)->frags[0].size -= len - pos;
+ skb_shinfo(skb)->frags[i].size = len - pos;
+ skb_shinfo(skb)->nr_frags++;
+ }
+ k++;
+ } else
+ skb_shinfo(skb)->nr_frags++; /* fragment lies entirely before len: stays in skb */
+ pos += size; /* pos becomes the offset of the next fragment */
+ }
+ skb_shinfo(skb1)->nr_frags = k;
+}
+
+/**
+ * skb_split - Split fragmented skb to two parts at length len.
+ * @skb: the buffer to split
+ * @skb1: the buffer to receive the second part
+ * @len: new length for skb
+ *
+ * Dispatches on where @len falls relative to the linear header of @skb.
+ */
+void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
+{
+	int headlen = skb_headlen(skb);
+
+	if (len >= headlen) {
+		/* Second chunk has no header, nothing to copy. */
+		skb_split_no_header(skb, skb1, len, headlen);
+		return;
+	}
+
+	/* Split line is inside header. */
+	skb_split_inside_header(skb, skb1, len, headlen);
+}
+
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable
+ *
+ * Resets @st so that reading starts at @skb with no fragment mapped
+ * and nothing stepped over yet. Must be called before invoking
+ * skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+			  unsigned int to, struct skb_seq_state *st)
+{
+	/* Window of interest. */
+	st->lower_offset = from;
+	st->upper_offset = to;
+
+	/* Start at the root skb itself. */
+	st->cur_skb = skb;
+	st->root_skb = skb;
+
+	/* Nothing walked, nothing mapped. */
+	st->stepped_offset = 0;
+	st->frag_idx = 0;
+	st->frag_data = NULL;
+}
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at &consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to &data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. &consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Whenever 0 is returned, any fragment page mapped by an earlier
+ * call has been unmapped again, so skb_abort_seq_read() is only
+ * needed when the caller stops before seeing 0.
+ *
+ * Note 1: The size of each block of data returned can be arbitrary,
+ *         this limitation is the cost for zerocopy sequential
+ *         reads of potentially non linear data.
+ *
+ * Note 2: Fragment lists within fragments are not implemented
+ *         at the moment, state->root_skb could be replaced with
+ *         a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+			  struct skb_seq_state *st)
+{
+	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+	skb_frag_t *frag;
+
+	if (unlikely(abs_offset >= st->upper_offset)) {
+		/*
+		 * The upper offset can be reached in the middle of a
+		 * fragment that a previous call left mapped. Drop that
+		 * mapping here; otherwise a caller that reads until 0
+		 * and then (as documented) skips skb_abort_seq_read()
+		 * would leak it.
+		 */
+		if (st->frag_data) {
+			kunmap_skb_frag(st->frag_data);
+			st->frag_data = NULL;
+		}
+		return 0;
+	}
+
+next_skb:
+	/* Linear area of the current skb. */
+	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
+
+	if (abs_offset < block_limit) {
+		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
+		return block_limit - abs_offset;
+	}
+
+	/* Step over the linear area once, before the first fragment. */
+	if (st->frag_idx == 0 && !st->frag_data)
+		st->stepped_offset += skb_headlen(st->cur_skb);
+
+	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+		block_limit = frag->size + st->stepped_offset;
+
+		if (abs_offset < block_limit) {
+			/* The mapping is kept across calls for this fragment. */
+			if (!st->frag_data)
+				st->frag_data = kmap_skb_frag(frag);
+
+			*data = (u8 *) st->frag_data + frag->page_offset +
+				(abs_offset - st->stepped_offset);
+
+			return block_limit - abs_offset;
+		}
+
+		/* Done with this fragment: unmap it and move on. */
+		if (st->frag_data) {
+			kunmap_skb_frag(st->frag_data);
+			st->frag_data = NULL;
+		}
+
+		st->frag_idx++;
+		st->stepped_offset += frag->size;
+	}
+
+	if (st->frag_data) {
+		kunmap_skb_frag(st->frag_data);
+		st->frag_data = NULL;
+	}
+
+	/* Continue with the root's frag_list, then along ->next. */
+	if (st->root_skb == st->cur_skb &&
+	    skb_shinfo(st->root_skb)->frag_list) {
+		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+		st->frag_idx = 0;
+		goto next_skb;
+	} else if (st->cur_skb->next) {
+		st->cur_skb = st->cur_skb->next;
+		st->frag_idx = 0;
+		goto next_skb;
+	}
+
+	return 0;
+}
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Releases the fragment mapping recorded in @st, if there is one.
+ * Must be called if skb_seq_read() was not called until it
+ * returned 0.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+	if (!st->frag_data)
+		return;
+
+	kunmap_skb_frag(st->frag_data);
+}
+
+#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
+
+/*
+ * get_next_block hook installed by skb_find_text(): fetches the next
+ * block through skb_seq_read(), using the sequential-read state kept
+ * in the control buffer of @state. @conf is unused.
+ */
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+					  struct ts_config *conf,
+					  struct ts_state *state)
+{
+	struct skb_seq_state *seq = TS_SKB_CB(state);
+
+	return skb_seq_read(offset, text, seq);
+}
+
+/*
+ * finish hook installed by skb_find_text(): aborts the sequential read
+ * whose state lives in the control buffer of @state. @conf is unused.
+ */
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{
+	struct skb_seq_state *seq = TS_SKB_CB(state);
+
+	skb_abort_seq_read(seq);
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration
+ * @state: uninitialized textsearch state variable
+ *
+ * Installs the skb block callbacks in @config, prepares a sequential
+ * read of @skb in @state and runs the search. Use textsearch_next()
+ * to retrieve subsequent occurrences of the pattern. Returns the
+ * offset to the first occurrence or UINT_MAX if no match was found.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+			   unsigned int to, struct ts_config *config,
+			   struct ts_state *state)
+{
+	struct skb_seq_state *seq = TS_SKB_CB(state);
+	unsigned int match;
+
+	config->get_next_block = skb_ts_get_next_block;
+	config->finish = skb_ts_finish;
+
+	skb_prepare_seq_read(skb, from, to, seq);
+
+	match = textsearch_find(config, state);
+
+	/* Anything beyond the requested window counts as "not found". */
+	if (match > to - from)
+		return UINT_MAX;
+
+	return match;
+}
+
+/**
+ * skb_append_datato_frags: - append the user data to a skb
+ * @sk: sock structure
+ * @skb: skb structure to be appended with user data.
+ * @getfrag: callback function to be used for getting the user data
+ * @from: pointer to user message iov
+ * @length: length of the iov message
+ *
+ * Description: This procedure appends the user data to the fragment part
+ * of the skb, allocating one new page per loop iteration and copying at
+ * most one page of data into it through @getfrag.
+ *
+ * Returns 0 on success, -ENOMEM if a page allocation fails, and -EFAULT
+ * if the skb has no free fragment slot left or @getfrag reports an error.
+ * On failure, pages attached by earlier iterations remain attached to @skb.
+ */
+int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
+ int (*getfrag)(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length)
+{
+ int frg_cnt = 0;
+ skb_frag_t *frag = NULL;
+ struct page *page = NULL;
+ int copy, left;
+ int offset = 0; /* running offset into the source data, passed to getfrag */
+ int ret;
+
+ do {
+ /* Return error if we don't have space for new frag */
+ frg_cnt = skb_shinfo(skb)->nr_frags;
+ if (frg_cnt >= MAX_SKB_FRAGS)
+ return -EFAULT;
+
+ /* allocate a new page for next frag */
+ page = alloc_pages(sk->sk_allocation, 0);
+
+ /* If alloc_page fails just return failure and caller will
+ * free previous allocated pages by doing kfree_skb()
+ */
+ if (page == NULL)
+ return -ENOMEM;
+
+ /* initialize the next frag */
+ sk->sk_sndmsg_page = page;
+ sk->sk_sndmsg_off = 0;
+ skb_fill_page_desc(skb, frg_cnt, page, 0, 0); /* new frag starts at page offset 0 with size 0 */
+ skb->truesize += PAGE_SIZE; /* the whole page is charged, not just the bytes copied */
+ atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+
+ /* get the new initialized frag */
+ frg_cnt = skb_shinfo(skb)->nr_frags;
+ frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
+
+ /* copy the user data to page */
+ left = PAGE_SIZE - frag->page_offset;
+ copy = (length > left)? left : length;
+
+ ret = getfrag(from, (page_address(frag->page) +
+ frag->page_offset + frag->size),
+ offset, copy, 0, skb);
+ if (ret < 0)
+ return -EFAULT; /* the page stays attached to skb as a zero-size frag */
+
+ /* copy was successful so update the size parameters */
+ sk->sk_sndmsg_off += copy;
+ frag->size += copy;
+ skb->len += copy;
+ skb->data_len += copy;
+ offset += copy;
+ length -= copy;
+
+ } while (length > 0);
+
+ return 0;
+}
+
+/**
+ * skb_pull_rcsum - pull skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pulled
+ *
+ * This function performs an skb_pull on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_pull unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ *
+ * Returns the new skb->data. BUG()s if @len exceeds skb->len or if
+ * the pull would leave skb->len below skb->data_len.
+ */
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+	BUG_ON(len > skb->len);
+	skb->len -= len;
+	BUG_ON(skb->len < skb->data_len);
+
+	/* Account for the pulled bytes while skb->data still points at them. */
+	skb_postpull_rcsum(skb, skb->data, len);
+
+	skb->data += len;
+	return skb->data;
+}
+
+EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+
+/**
+ * skb_segment - Perform protocol segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function performs segmentation on the given skb. It returns
+ * a pointer to the first in a list of new skbs for the segments.
+ * In case of error it returns ERR_PTR(err).
+ *
+ * Each segment carries at most gso_size bytes of payload, preceded by a
+ * copy of the first doffset bytes of @skb (everything from the MAC header
+ * up to skb->data on entry). Without NETIF_F_SG in @features the payload
+ * is copied into the linear area and checksummed; with it, page fragments
+ * of @skb are referenced (get_page) instead of copied.
+ *
+ * The only error generated here is -ENOMEM, when a segment cannot be
+ * allocated; all segments built so far are freed in that case. @skb is
+ * left with doffset bytes pushed and is not freed by this function.
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+{
+ struct sk_buff *segs = NULL; /* head of the segment list */
+ struct sk_buff *tail = NULL; /* last segment linked so far */
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+ unsigned int doffset = skb->data - skb_mac_header(skb); /* bytes of headers in front of the payload */
+ unsigned int offset = doffset; /* start of the current segment's payload within skb */
+ unsigned int headroom;
+ unsigned int len;
+ int sg = features & NETIF_F_SG;
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ int err = -ENOMEM;
+ int i = 0; /* index of the next source fragment; persists across segments */
+ int pos; /* offset within skb at which fragment i starts; persists across segments */
+
+ __skb_push(skb, doffset); /* make offsets relative to the MAC header */
+ headroom = skb_headroom(skb);
+ pos = skb_headlen(skb);
+
+ do {
+ struct sk_buff *nskb;
+ skb_frag_t *frag;
+ int hsize;
+ int k;
+ int size;
+
+ len = skb->len - offset;
+ if (len > mss)
+ len = mss;
+
+ hsize = skb_headlen(skb) - offset; /* payload bytes still available in skb's linear area */
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len; /* without SG the whole payload goes into the linear area */
+
+ nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
+ if (unlikely(!nskb))
+ goto err;
+
+ if (segs)
+ tail->next = nskb;
+ else
+ segs = nskb;
+ tail = nskb;
+
+ __copy_skb_header(nskb, skb);
+ nskb->mac_len = skb->mac_len;
+
+ skb_reserve(nskb, headroom);
+ skb_reset_mac_header(nskb);
+ skb_set_network_header(nskb, skb->mac_len);
+ nskb->transport_header = (nskb->network_header +
+ skb_network_header_len(skb));
+ skb_copy_from_linear_data(skb, skb_put(nskb, doffset),
+ doffset); /* replicate the headers in every segment */
+ if (!sg) {
+ nskb->ip_summed = CHECKSUM_NONE;
+ nskb->csum = skb_copy_and_csum_bits(skb, offset,
+ skb_put(nskb, len),
+ len, 0);
+ continue; /* jumps to the loop condition, which advances offset */
+ }
+
+ frag = skb_shinfo(nskb)->frags;
+ k = 0; /* number of fragments attached to nskb */
+
+ skb_copy_from_linear_data_offset(skb, offset,
+ skb_put(nskb, hsize), hsize);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ *frag = skb_shinfo(skb)->frags[i];
+ get_page(frag->page);
+ size = frag->size; /* full size of source fragment i */
+
+ if (pos < offset) { /* segment starts in the middle of this fragment */
+ frag->page_offset += offset - pos;
+ frag->size -= offset - pos;
+ }
+
+ k++;
+
+ if (pos + size <= offset + len) { /* fragment ends within this segment: consume it */
+ i++;
+ pos += size;
+ } else {
+ frag->size -= pos + size - (offset + len); /* trim the part that belongs to the next segment */
+ break;
+ }
+
+ frag++;
+ }
+
+ skb_shinfo(nskb)->nr_frags = k;
+ nskb->data_len = len - hsize;
+ nskb->len += nskb->data_len;
+ nskb->truesize += nskb->data_len;
+ } while ((offset += len) < skb->len);
+
+ return segs;
+
+err:
+ while ((skb = segs)) { /* skb is reused as the list cursor; the input skb is not freed */
+ segs = skb->next;
+ kfree_skb(skb);
+ }
+ return ERR_PTR(err);
+}
+
+EXPORT_SYMBOL_GPL(skb_segment);
+
+/*
+ * Create the two slab caches for sk_buff heads: skbuff_head_cache holds
+ * single struct sk_buff objects, skbuff_fclone_cache holds objects made
+ * of two struct sk_buff plus one atomic_t.
+ */
+void __init skb_init(void)
+{
+	const size_t head_size = sizeof(struct sk_buff);
+	const size_t fclone_size = (2*sizeof(struct sk_buff)) +
+				   sizeof(atomic_t);
+
+	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+					      head_size,
+					      0,
+					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					      NULL);
+
+	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+						fclone_size,
+						0,
+						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+						NULL);
+}
+
+/**
+ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ * @skb: Socket buffer containing the buffers to be mapped
+ * @sg: The scatter-gather list to map into
+ * @offset: The offset into the buffer's contents to start mapping
+ * @len: Length of buffer space to be mapped
+ *
+ * Fill the specified scatter-gather list with mappings/pointers into a
+ * region of the buffer space attached to a socket buffer.
+ *
+ * This is the worker behind skb_to_sgvec(). It maps the linear area
+ * first, then the page fragments, then recurses into every skb on the
+ * frag_list, and returns the number of scatterlist entries it filled.
+ * The capacity of @sg is not checked here. BUG()s if fewer than @len
+ * bytes could be mapped.
+ */
+static int
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int start = skb_headlen(skb); /* offset within skb at which the next region begins */
+ int i, copy = start - offset;
+ int elt = 0; /* entries of sg filled so far */
+
+ if (copy > 0) { /* the requested range starts inside the linear area */
+ if (copy > len)
+ copy = len;
+ sg_set_buf(sg, skb->data + offset, copy);
+ elt++;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) { /* fragment i overlaps the requested range */
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (copy > len)
+ copy = len;
+ sg_set_page(&sg[elt], frag->page, copy,
+ frag->page_offset+offset-start);
+ elt++;
+ if (!(len -= copy))
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ elt += __skb_to_sgvec(list, sg+elt, offset - start,
+ copy); /* offset is made relative to the start of this list member */
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+ }
+ BUG_ON(len); /* the requested range extends past the end of the skb data */
+ return elt;
+}
+
+/*
+ * Fill @sg from @skb via __skb_to_sgvec(), mark the last entry that was
+ * filled as the end of the list, and return the number of entries used.
+ */
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+	int nents;
+
+	nents = __skb_to_sgvec(skb, sg, offset, len);
+	sg_mark_end(&sg[nents - 1]);
+
+	return nents;
+}
+
+/**
+ * skb_cow_data - Check that a socket buffer's data buffers are writable
+ * @skb: The socket buffer to check.
+ * @tailbits: Amount of trailing space to be added
+ * @trailer: Returned pointer to the skb where the @tailbits space begins
+ *
+ * Make sure that the data buffers attached to a socket buffer are
+ * writable. If they are not, private copies are made of the data buffers
+ * and the socket buffer is set to use these instead.
+ *
+ * If @tailbits is given, make sure that there is space to write @tailbits
+ * bytes of data beyond current end of socket buffer. @trailer will be
+ * set to point to the skb in which this space begins.
+ *
+ * The number of scatterlist elements required to completely map the
+ * COW'd and extended socket buffer will be returned.
+ *
+ * Returns -ENOMEM if any of the required reallocations or copies fails.
+ * In that case fragments replaced before the failure stay replaced.
+ */
+int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
+{
+ int copyflag; /* once set, every remaining fragment is copied */
+ int elt; /* element count returned to the caller */
+ struct sk_buff *skb1, **skb_p; /* skb_p: the link that points at the fragment under inspection */
+
+ /* If skb is cloned or its head is paged, reallocate
+ * head pulling out all the pages (pages are considered not writable
+ * at the moment even if they are anonymous).
+ */
+ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
+ __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ /* Easy case. Most of packets will go this way. */
+ if (!skb_shinfo(skb)->frag_list) {
+ /* A little of trouble, not enough of space for trailer.
+ * This should not happen, when stack is tuned to generate
+ * good frames. OK, on miss we reallocate and reserve even more
+ * space, 128 bytes is fair. */
+
+ if (skb_tailroom(skb) < tailbits &&
+ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Voila! */
+ *trailer = skb;
+ return 1;
+ }
+
+ /* Misery. We are in troubles, going to mincer fragments... */
+
+ elt = 1; /* one element for the head skb itself */
+ skb_p = &skb_shinfo(skb)->frag_list;
+ copyflag = 0;
+
+ while ((skb1 = *skb_p) != NULL) {
+ int ntail = 0; /* tailroom to request when copying the last fragment */
+
+ /* The fragment is partially pulled by someone,
+ * this can happen on input. Copy it and everything
+ * after it. */
+
+ if (skb_shared(skb1))
+ copyflag = 1;
+
+ /* If the skb is the last, worry about trailer. */
+
+ if (skb1->next == NULL && tailbits) {
+ if (skb_shinfo(skb1)->nr_frags ||
+ skb_shinfo(skb1)->frag_list ||
+ skb_tailroom(skb1) < tailbits)
+ ntail = tailbits + 128;
+ }
+
+ if (copyflag ||
+ skb_cloned(skb1) ||
+ ntail ||
+ skb_shinfo(skb1)->nr_frags ||
+ skb_shinfo(skb1)->frag_list) {
+ struct sk_buff *skb2;
+
+ /* This fragment cannot be used as it is: replace it
+ * with a private copy, with extra tailroom if it is
+ * the one that has to hold the trailer. */
+ if (ntail == 0)
+ skb2 = skb_copy(skb1, GFP_ATOMIC);
+ else
+ skb2 = skb_copy_expand(skb1,
+ skb_headroom(skb1),
+ ntail,
+ GFP_ATOMIC);
+ if (unlikely(skb2 == NULL))
+ return -ENOMEM;
+
+ if (skb1->sk)
+ skb_set_owner_w(skb2, skb1->sk);
+
+ /* Looking around. Are we still alive?
+ * OK, link new skb, drop old one */
+
+ skb2->next = skb1->next;
+ *skb_p = skb2;
+ kfree_skb(skb1);
+ skb1 = skb2;
+ }
+ elt++;
+ *trailer = skb1; /* ends up pointing at the last fragment */
+ skb_p = &skb1->next;
+ }
+
+ return elt;
+}
+
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops.
+ *
+ * The two-byte checksum field at @start + @off must end within skb->len.
+ * The end offset is computed by addition instead of comparing against
+ * skb->len - 2: that subtraction is unsigned and wraps around for packets
+ * shorter than two bytes, which would let arbitrary offsets through.
+ *
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{
+	/* u16 + u16 + 2 always fits in a u32, so this cannot overflow. */
+	const u32 csum_end = (u32)start + (u32)off + 2;
+
+	if (unlikely(csum_end > skb->len)) {
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "bad partial csum: csum=%u/%u len=%u\n",
+			       start, off, skb->len);
+		return false;
+	}
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_headroom(skb) + start;
+	skb->csum_offset = off;
+	return true;
+}
+
+/*
+ * Emit a rate-limited warning, naming the device of @skb, that received
+ * packets cannot be forwarded while LRO is enabled.
+ */
+void __skb_warn_lro_forwarding(const struct sk_buff *skb)
+{
+	if (!net_ratelimit())
+		return;
+
+	pr_warning("%s: received packets cannot be forwarded"
+		   " while LRO is enabled\n", skb->dev->name);
+}
+
+EXPORT_SYMBOL(___pskb_trim);
+EXPORT_SYMBOL(__kfree_skb);
+EXPORT_SYMBOL(kfree_skb);
+EXPORT_SYMBOL(__pskb_pull_tail);
+EXPORT_SYMBOL(__alloc_skb);
+EXPORT_SYMBOL(__netdev_alloc_skb);
+EXPORT_SYMBOL(pskb_copy);
+EXPORT_SYMBOL(pskb_expand_head);
+EXPORT_SYMBOL(skb_checksum);
+EXPORT_SYMBOL(skb_clone);
+EXPORT_SYMBOL(skb_copy);
+EXPORT_SYMBOL(skb_copy_and_csum_bits);
+EXPORT_SYMBOL(skb_copy_and_csum_dev);
+EXPORT_SYMBOL(skb_copy_bits);
+EXPORT_SYMBOL(skb_copy_expand);
+EXPORT_SYMBOL(skb_over_panic);
+EXPORT_SYMBOL(skb_pad);
+EXPORT_SYMBOL(skb_realloc_headroom);
+EXPORT_SYMBOL(skb_under_panic);
+EXPORT_SYMBOL(skb_dequeue);
+EXPORT_SYMBOL(skb_dequeue_tail);
+EXPORT_SYMBOL(skb_insert);
+EXPORT_SYMBOL(skb_queue_purge);
+EXPORT_SYMBOL(skb_queue_head);
+EXPORT_SYMBOL(skb_queue_tail);
+EXPORT_SYMBOL(skb_unlink);
+EXPORT_SYMBOL(skb_append);
+EXPORT_SYMBOL(skb_split);
+EXPORT_SYMBOL(skb_prepare_seq_read);
+EXPORT_SYMBOL(skb_seq_read);
+EXPORT_SYMBOL(skb_abort_seq_read);
+EXPORT_SYMBOL(skb_find_text);
+EXPORT_SYMBOL(skb_append_datato_frags);
+EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+EXPORT_SYMBOL_GPL(skb_cow_data);
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);
diff --git a/net/core/sock.c b/net/core/sock.c
new file mode 100644
index 0000000..1b0f74c
--- /dev/null
+++ b/net/core/sock.c
@@ -0,0 +1,2271 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic socket support routines. Memory allocators, socket lock/release
+ * handler for protocols to use and generic option handler.
+ *
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() problems
+ * Alan Cox : Connecting on a connecting socket
+ * now returns an error for tcp.
+ * Alan Cox : sock->protocol is set correctly.
+ * and is not sometimes left as 0.
+ * Alan Cox : connect handles icmp errors on a
+ * connect properly. Unfortunately there
+ * is a restart syscall nasty there. I
+ * can't match BSD without hacking the C
+ * library. Ideas urgently sought!
+ * Alan Cox : Disallow bind() to addresses that are
+ * not ours - especially broadcast ones!!
+ * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
+ * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
+ * instead they leave that for the DESTROY timer.
+ * Alan Cox : Clean up error flag in accept
+ * Alan Cox : TCP ack handling is buggy, the DESTROY timer
+ * was buggy. Put a remove_sock() in the handler
+ * for memory when we hit 0. Also altered the timer
+ * code. The ACK stuff can wait and needs major
+ * TCP layer surgery.
+ * Alan Cox : Fixed TCP ack bug, removed remove sock
+ * and fixed timer/inet_bh race.
+ * Alan Cox : Added zapped flag for TCP
+ * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
+ * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
+ * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
+ * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
+ * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
+ * Rick Sladkey : Relaxed UDP rules for matching packets.
+ * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
+ * Pauline Middelink : identd support
+ * Alan Cox : Fixed connect() taking signals I think.
+ * Alan Cox : SO_LINGER supported
+ * Alan Cox : Error reporting fixes
+ * Anonymous : inet_create tidied up (sk->reuse setting)
+ * Alan Cox : inet sockets don't set sk->type!
+ * Alan Cox : Split socket option code
+ * Alan Cox : Callbacks
+ * Alan Cox : Nagle flag for Charles & Johannes stuff
+ * Alex : Removed restriction on inet fioctl
+ * Alan Cox : Splitting INET from NET core
+ * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
+ * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
+ * Alan Cox : Split IP from generic code
+ * Alan Cox : New kfree_skbmem()
+ * Alan Cox : Make SO_DEBUG superuser only.
+ * Alan Cox : Allow anyone to clear SO_DEBUG
+ * (compatibility fix)
+ * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
+ * Alan Cox : Allocator for a socket is settable.
+ * Alan Cox : SO_ERROR includes soft errors.
+ * Alan Cox : Allow NULL arguments on some SO_ opts
+ * Alan Cox : Generic socket allocation to make hooks
+ * easier (suggested by Craig Metz).
+ * Michael Pall : SO_ERROR returns positive errno again
+ * Steve Whitehouse: Added default destructor to free
+ * protocol private data.
+ * Steve Whitehouse: Added various other default routines
+ * common to several socket families.
+ * Chris Evans : Call suser() check last on F_SETOWN
+ * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
+ * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
+ * Andi Kleen : Fix write_space callback
+ * Chris Evans : Security fixes - signedness again
+ * Arnaldo C. Melo : cleanups, use skb_queue_purge
+ *
+ * To Fix:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/netdevice.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <linux/ipsec.h>
+
+#include <linux/filter.h>
+
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+#endif
+
+/*
+ * Each address family might have different locking rules, so we have
+ * one slock key per address family:
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+
+/*
+ * Make lock validator output more readable. (we pre-construct these
+ * strings build-time, so that runtime initialization of socket
+ * locks is fast):
+ */
+/*
+ * Lock validator names for the sk_lock keys, one string per address
+ * family number plus a final AF_MAX entry. Every named family uses the
+ * "sk_lock-AF_<name>" form, matching the slock- and clock- tables.
+ */
+static const char *af_family_key_strings[AF_MAX+1] = {
+	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
+	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
+	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
+	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
+	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
+	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
+	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
+	"sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
+	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
+	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
+	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
+	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
+	"sk_lock-AF_MAX"
+};
+static const char *af_family_slock_key_strings[AF_MAX+1] = {
+ "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
+ "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
+ "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
+ "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
+ "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
+ "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
+ "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
+ "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" ,
+ "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
+ "slock-27" , "slock-28" , "slock-AF_CAN" ,
+ "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
+ "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
+ "slock-AF_MAX"
+};
+static const char *af_family_clock_key_strings[AF_MAX+1] = {
+ "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
+ "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
+ "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
+ "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
+ "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
+ "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
+ "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
+ "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" ,
+ "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
+ "clock-27" , "clock-28" , "clock-AF_CAN" ,
+ "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
+ "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
+ "clock-AF_MAX"
+};
+
+/*
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+
+/* Take into consideration the size of the struct sk_buff overhead in the
+ * determination of these values, since that is non-constant across
+ * platforms. This makes socket queueing behavior and performance
+ * not depend upon such differences.
+ */
+#define _SK_MEM_PACKETS 256
+#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
+#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+
+/* Run time adjustable parameters. */
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+
+/* Maximal space eaten by iovec or ancillary data plus some space */
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+
+/*
+ * Read a struct timeval from user space and convert it to a timeout in
+ * jiffies, stored in *@timeo_p.
+ *
+ * Returns -EINVAL if @optlen is smaller than a timeval, -EFAULT if the
+ * copy from user space fails and -EDOM if tv_usec is out of range.
+ * Otherwise returns 0: a negative tv_sec yields a timeout of 0 (with a
+ * limited number of log messages), an all-zero timeval or a tv_sec too
+ * large to convert yields MAX_SCHEDULE_TIMEOUT, and everything else is
+ * converted with the microseconds rounded up to whole ticks.
+ */
+static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+{
+	const long usec_per_tick = 1000000/HZ;
+	struct timeval req;
+
+	if (optlen < sizeof(req))
+		return -EINVAL;
+	if (copy_from_user(&req, optval, sizeof(req)))
+		return -EFAULT;
+	if (req.tv_usec < 0 || req.tv_usec >= USEC_PER_SEC)
+		return -EDOM;
+
+	if (req.tv_sec < 0) {
+		static int nr_warned __read_mostly;
+
+		*timeo_p = 0;
+		if (nr_warned < 10 && net_ratelimit()) {
+			nr_warned++;
+			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
+			       "tries to set negative timeout\n",
+			       current->comm, task_pid_nr(current));
+		}
+		return 0;
+	}
+
+	/* Zero, or too large to express in jiffies: use the maximum timeout. */
+	if ((req.tv_sec == 0 && req.tv_usec == 0) ||
+	    req.tv_sec >= (MAX_SCHEDULE_TIMEOUT/HZ - 1)) {
+		*timeo_p = MAX_SCHEDULE_TIMEOUT;
+		return 0;
+	}
+
+	*timeo_p = req.tv_sec*HZ +
+		   (req.tv_usec + (usec_per_tick - 1)) / usec_per_tick;
+	return 0;
+}
+
+/*
+ * Log that the current process uses the obsolete SO_BSDCOMPAT option.
+ * Nothing is logged when the process name equals the one remembered from
+ * the previous warning, or once five warnings have been printed.
+ */
+static void sock_warn_obsolete_bsdism(const char *name)
+{
+	static int nr_warned;
+	static char last_comm[TASK_COMM_LEN];
+
+	if (!strcmp(last_comm, current->comm) || nr_warned >= 5)
+		return;
+
+	strcpy(last_comm, current->comm);
+	printk(KERN_WARNING "process `%s' is using obsolete "
+	       "%s SO_BSDCOMPAT\n", last_comm, name);
+	nr_warned++;
+}
+
+/*
+ * Clear SOCK_TIMESTAMP on @sk and call net_disable_timestamp(), but only
+ * if the flag is currently set.
+ */
+static void sock_disable_timestamp(struct sock *sk)
+{
+	if (!sock_flag(sk, SOCK_TIMESTAMP))
+		return;
+
+	sock_reset_flag(sk, SOCK_TIMESTAMP);
+	net_disable_timestamp();
+}
+
+
+/*
+ * Queue @skb on the receive queue of @sk and signal the socket.
+ *
+ * Returns 0 on success, -ENOMEM if the receive buffer limit would be
+ * reached, the error from sk_filter() if the filter rejects the packet,
+ * or -ENOBUFS if receive memory cannot be scheduled. On error the skb
+ * has not been queued and is not freed here.
+ */
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int queued_len;
+	int rc;
+
+	/* The cast of sk_rcvbuf only serves to quieten compiler warnings. */
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	rc = sk_filter(sk, skb);
+	if (rc)
+		return rc;
+
+	if (!sk_rmem_schedule(sk, skb->truesize))
+		return -ENOBUFS;
+
+	skb->dev = NULL;
+	skb_set_owner_r(skb, sk);
+
+	/*
+	 * Remember the length now: as soon as the skb sits on the receive
+	 * queue it no longer belongs to us and may be freed by whoever
+	 * pulls packets from the queue.
+	 */
+	queued_len = skb->len;
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, queued_len);
+
+	return 0;
+}
+EXPORT_SYMBOL(sock_queue_rcv_skb);
+
+/*
+ * Run @skb through the socket filter and hand it to @sk: processed
+ * directly through sk_backlog_rcv() when no user context owns the
+ * socket, added to the backlog otherwise. @nested selects the nested
+ * variant of the socket lock. A filtered packet is freed. sock_put()
+ * is called on @sk on every path.
+ *
+ * Returns the result of sk_backlog_rcv() when it was called, and
+ * NET_RX_SUCCESS in all other cases.
+ */
+int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+{
+	int verdict = NET_RX_SUCCESS;
+
+	if (sk_filter(sk, skb)) {
+		kfree_skb(skb);
+		sock_put(sk);
+		return verdict;
+	}
+
+	skb->dev = NULL;
+
+	if (!nested)
+		bh_lock_sock(sk);
+	else
+		bh_lock_sock_nested(sk);
+
+	if (sock_owned_by_user(sk)) {
+		sk_add_backlog(sk, skb);
+	} else {
+		/*
+		 * trylock + unlock semantics:
+		 */
+		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+		verdict = sk_backlog_rcv(sk, skb);
+
+		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+	}
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+	return verdict;
+}
+EXPORT_SYMBOL(sk_receive_skb);
+
+/*
+ * Return the dst cached in @sk, or NULL. A cached entry that is marked
+ * obsolete and fails its ops->check() for @cookie is removed from the
+ * socket and released, and NULL is returned. The cache is read and
+ * written directly through sk->sk_dst_cache.
+ */
+struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *cached = sk->sk_dst_cache;
+
+	/* Nothing cached, not obsolete, or still valid: hand it back as is. */
+	if (!cached || !cached->obsolete ||
+	    cached->ops->check(cached, cookie) != NULL)
+		return cached;
+
+	sk->sk_dst_cache = NULL;
+	dst_release(cached);
+	return NULL;
+}
+EXPORT_SYMBOL(__sk_dst_check);
+
+/*
+ * Fetch the dst of @sk through sk_dst_get() and return it, or NULL. An
+ * entry that is marked obsolete and fails its ops->check() for @cookie
+ * causes the socket's dst to be reset and the fetched entry to be
+ * released, and NULL is returned.
+ */
+struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *held = sk_dst_get(sk);
+
+	/* Nothing there, not obsolete, or still valid: hand it back as is. */
+	if (!held || !held->obsolete ||
+	    held->ops->check(held, cookie) != NULL)
+		return held;
+
+	sk_dst_reset(sk);
+	dst_release(held);
+	return NULL;
+}
+EXPORT_SYMBOL(sk_dst_check);
+
+static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+{ /* Handler for SO_BINDTODEVICE: bind @sk to the interface named in @optval, or unbind it. */
+ int ret = -ENOPROTOOPT; /* the result when CONFIG_NETDEVICES is not set */
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+ int index;
+
+ /* Sorry... the caller needs CAP_NET_RAW. */
+ ret = -EPERM;
+ if (!capable(CAP_NET_RAW))
+ goto out;
+
+ ret = -EINVAL;
+ if (optlen < 0)
+ goto out;
+
+ /* Bind this socket to a particular device like "eth0",
+ * as specified in the passed interface name. If the
+ * name is "" or the option length is zero the socket
+ * is not bound.
+ *
+ * At most IFNAMSIZ - 1 bytes are copied into the zeroed
+ * buffer, so devname is always NUL-terminated.
+ */
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
+ memset(devname, 0, sizeof(devname));
+
+ ret = -EFAULT;
+ if (copy_from_user(devname, optval, optlen))
+ goto out;
+
+ if (devname[0] == '\0') {
+ index = 0; /* empty name: remove the binding */
+ } else {
+ struct net_device *dev = dev_get_by_name(net, devname);
+
+ ret = -ENODEV;
+ if (!dev)
+ goto out;
+
+ index = dev->ifindex;
+ dev_put(dev); /* only the index is kept, not the device reference */
+ }
+
+ lock_sock(sk);
+ sk->sk_bound_dev_if = index;
+ sk_dst_reset(sk); /* discard the socket's cached dst along with the old binding */
+ release_sock(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+	if (!valbool) {
+		sock_reset_flag(sk, bit);
+		return;
+	}
+
+	sock_set_flag(sk, bit);
+}
+
+/*
+ * This is meant for all protocols to use and covers goings on
+ * at the socket level. Everything here is generic.
+ *
+ * Apart from SO_BINDTODEVICE, every option is first read as an int
+ * from @optval, so @optlen must cover at least sizeof(int). Options
+ * that take a larger argument (SO_LINGER, SO_RCVTIMEO, SO_SNDTIMEO,
+ * SO_ATTACH_FILTER) re-read it from @optval themselves.
+ *
+ * The option switch runs with the socket locked via lock_sock().
+ * @level is not examined here.
+ *
+ * Returns 0 on success or a negative errno; unknown options yield
+ * -ENOPROTOOPT.
+ */
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int optlen)
+{
+ struct sock *sk=sock->sk;
+ int val;
+ int valbool;
+ struct linger ling;
+ int ret = 0;
+
+ /*
+ * SO_BINDTODEVICE takes a device name rather than an int, so it
+ * is dispatched before the generic int argument is parsed.
+ */
+
+ if (optname == SO_BINDTODEVICE)
+ return sock_bindtodevice(sk, optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val?1:0;
+
+ lock_sock(sk);
+
+ switch(optname) {
+ case SO_DEBUG:
+ /* Enabling debug needs CAP_NET_ADMIN; disabling does not. */
+ if (val && !capable(CAP_NET_ADMIN)) {
+ ret = -EACCES;
+ } else
+ sock_valbool_flag(sk, SOCK_DBG, valbool);
+ break;
+ case SO_REUSEADDR:
+ sk->sk_reuse = valbool;
+ break;
+ case SO_TYPE:
+ case SO_ERROR:
+ /* Not settable. */
+ ret = -ENOPROTOOPT;
+ break;
+ case SO_DONTROUTE:
+ sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ break;
+ case SO_BROADCAST:
+ sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+ break;
+ case SO_SNDBUF:
+ /* Don't error on this BSD doesn't and if you think
+ about it this is right. Otherwise apps have to
+ play 'guess the biggest size' games. RCVBUF/SNDBUF
+ are treated in BSD as hints */
+
+ if (val > sysctl_wmem_max)
+ val = sysctl_wmem_max;
+set_sndbuf:
+ /* Also reached from SO_SNDBUFFORCE, which skips the clamp above. */
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ if ((val * 2) < SOCK_MIN_SNDBUF)
+ sk->sk_sndbuf = SOCK_MIN_SNDBUF;
+ else
+ sk->sk_sndbuf = val * 2;
+
+ /*
+ * Wake up sending tasks if we
+ * upped the value.
+ */
+ sk->sk_write_space(sk);
+ break;
+
+ case SO_SNDBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_sndbuf;
+
+ case SO_RCVBUF:
+ /* Don't error on this BSD doesn't and if you think
+ about it this is right. Otherwise apps have to
+ play 'guess the biggest size' games. RCVBUF/SNDBUF
+ are treated in BSD as hints */
+
+ if (val > sysctl_rmem_max)
+ val = sysctl_rmem_max;
+set_rcvbuf:
+ /* Also reached from SO_RCVBUFFORCE, which skips the clamp above. */
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ /*
+ * We double it on the way in to account for
+ * "struct sk_buff" etc. overhead. Applications
+ * assume that the SO_RCVBUF setting they make will
+ * allow that much actual data to be received on that
+ * socket.
+ *
+ * Applications are unaware that "struct sk_buff" and
+ * other overheads allocate from the receive buffer
+ * during socket buffer allocation.
+ *
+ * And after considering the possible alternatives,
+ * returning the value we actually used in getsockopt
+ * is the most desirable behavior.
+ */
+ if ((val * 2) < SOCK_MIN_RCVBUF)
+ sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
+ else
+ sk->sk_rcvbuf = val * 2;
+ break;
+
+ case SO_RCVBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_rcvbuf;
+
+ case SO_KEEPALIVE:
+#ifdef CONFIG_INET
+ if (sk->sk_protocol == IPPROTO_TCP)
+ tcp_set_keepalive(sk, valbool);
+#endif
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ break;
+
+ case SO_OOBINLINE:
+ sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+ break;
+
+ case SO_NO_CHECK:
+ sk->sk_no_check = valbool;
+ break;
+
+ case SO_PRIORITY:
+ /* Values outside 0..6 require CAP_NET_ADMIN. */
+ if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+ sk->sk_priority = val;
+ else
+ ret = -EPERM;
+ break;
+
+ case SO_LINGER:
+ if (optlen < sizeof(ling)) {
+ ret = -EINVAL; /* 1003.1g */
+ break;
+ }
+ if (copy_from_user(&ling,optval,sizeof(ling))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (!ling.l_onoff)
+ sock_reset_flag(sk, SOCK_LINGER);
+ else {
+#if (BITS_PER_LONG == 32)
+ /* Clamp so that l_linger * HZ cannot exceed MAX_SCHEDULE_TIMEOUT. */
+ if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ else
+#endif
+ sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+ sock_set_flag(sk, SOCK_LINGER);
+ }
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("setsockopt");
+ break;
+
+ case SO_PASSCRED:
+ if (valbool)
+ set_bit(SOCK_PASSCRED, &sock->flags);
+ else
+ clear_bit(SOCK_PASSCRED, &sock->flags);
+ break;
+
+ case SO_TIMESTAMP:
+ case SO_TIMESTAMPNS:
+ /*
+ * Both options share SOCK_RCVTSTAMP; SOCK_RCVTSTAMPNS is set
+ * only for SO_TIMESTAMPNS. Disabling either clears both flags.
+ */
+ if (valbool) {
+ if (optname == SO_TIMESTAMP)
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ else
+ sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+ break;
+
+ case SO_RCVLOWAT:
+ /* Negative means "as large as possible"; zero is stored as 1. */
+ if (val < 0)
+ val = INT_MAX;
+ sk->sk_rcvlowat = val ? : 1;
+ break;
+
+ case SO_RCVTIMEO:
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ break;
+
+ case SO_SNDTIMEO:
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ break;
+
+ case SO_ATTACH_FILTER:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_DETACH_FILTER:
+ ret = sk_detach_filter(sk);
+ break;
+
+ case SO_PASSSEC:
+ if (valbool)
+ set_bit(SOCK_PASSSEC, &sock->flags);
+ else
+ clear_bit(SOCK_PASSSEC, &sock->flags);
+ break;
+ case SO_MARK:
+ if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else {
+ sk->sk_mark = val;
+ }
+ break;
+
+ /* We implement the SO_SNDLOWAT etc to
+ not be settable (1003.1g 5.3) */
+ default:
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+ return ret;
+}
+
+
+/*
+ * Generic socket-level getsockopt handler.
+ *
+ * The caller's buffer length is read from @optlen (negative lengths
+ * are rejected with -EINVAL). Most options build their value in the
+ * zeroed local union v; min(len, lv) bytes of it are then copied to
+ * @optval and the number of bytes copied is written back to @optlen.
+ * SO_PEERCRED and SO_PEERNAME copy their data out directly, and
+ * SO_PEERSEC is delegated to the security layer.
+ *
+ * @level is not examined here. No socket lock is taken in this
+ * function.
+ *
+ * Returns 0 on success or a negative errno; unknown options yield
+ * -ENOPROTOOPT.
+ */
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ union {
+ int val;
+ struct linger ling;
+ struct timeval tm;
+ } v;
+
+ /* lv is the natural size of the option value; int unless overridden. */
+ unsigned int lv = sizeof(int);
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ memset(&v, 0, sizeof(v));
+
+ switch(optname) {
+ case SO_DEBUG:
+ v.val = sock_flag(sk, SOCK_DBG);
+ break;
+
+ case SO_DONTROUTE:
+ v.val = sock_flag(sk, SOCK_LOCALROUTE);
+ break;
+
+ case SO_BROADCAST:
+ v.val = !!sock_flag(sk, SOCK_BROADCAST);
+ break;
+
+ case SO_SNDBUF:
+ v.val = sk->sk_sndbuf;
+ break;
+
+ case SO_RCVBUF:
+ v.val = sk->sk_rcvbuf;
+ break;
+
+ case SO_REUSEADDR:
+ v.val = sk->sk_reuse;
+ break;
+
+ case SO_KEEPALIVE:
+ v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
+ break;
+
+ case SO_TYPE:
+ v.val = sk->sk_type;
+ break;
+
+ case SO_ERROR:
+ /* Fall back to the soft error, clearing it as it is read. */
+ v.val = -sock_error(sk);
+ if (v.val==0)
+ v.val = xchg(&sk->sk_err_soft, 0);
+ break;
+
+ case SO_OOBINLINE:
+ v.val = !!sock_flag(sk, SOCK_URGINLINE);
+ break;
+
+ case SO_NO_CHECK:
+ v.val = sk->sk_no_check;
+ break;
+
+ case SO_PRIORITY:
+ v.val = sk->sk_priority;
+ break;
+
+ case SO_LINGER:
+ lv = sizeof(v.ling);
+ v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
+ v.ling.l_linger = sk->sk_lingertime / HZ;
+ break;
+
+ case SO_BSDCOMPAT:
+ /* Only warns; the zeroed v.val is what gets reported. */
+ sock_warn_obsolete_bsdism("getsockopt");
+ break;
+
+ case SO_TIMESTAMP:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPNS:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_RCVTIMEO:
+ /* MAX_SCHEDULE_TIMEOUT (no timeout) is reported as 0s/0us. */
+ lv=sizeof(struct timeval);
+ if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ /* Same encoding as SO_RCVTIMEO above. */
+ lv=sizeof(struct timeval);
+ if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+ }
+ break;
+
+ case SO_RCVLOWAT:
+ v.val = sk->sk_rcvlowat;
+ break;
+
+ case SO_SNDLOWAT:
+ v.val=1;
+ break;
+
+ case SO_PASSCRED:
+ v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
+ break;
+
+ case SO_PEERCRED:
+ /* Copied straight from the socket, truncated to the caller's len. */
+ if (len > sizeof(sk->sk_peercred))
+ len = sizeof(sk->sk_peercred);
+ if (copy_to_user(optval, &sk->sk_peercred, len))
+ return -EFAULT;
+ goto lenout;
+
+ case SO_PEERNAME:
+ {
+ char address[128];
+
+ /* getname() stores the address length in lv. */
+ if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+ return -ENOTCONN;
+ /* Caller asked for more bytes than the address holds. */
+ if (lv < len)
+ return -EINVAL;
+ if (copy_to_user(optval, address, len))
+ return -EFAULT;
+ goto lenout;
+ }
+
+ /* Dubious BSD thing... Probably nobody even uses it, but
+ * the UNIX standard wants it for whatever reason... -DaveM
+ */
+ case SO_ACCEPTCONN:
+ v.val = sk->sk_state == TCP_LISTEN;
+ break;
+
+ case SO_PASSSEC:
+ v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
+ break;
+
+ case SO_PEERSEC:
+ return security_socket_getpeersec_stream(sock, optval, optlen, len);
+
+ case SO_MARK:
+ v.val = sk->sk_mark;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ /* Never copy more than the option's natural size. */
+ if (len > lv)
+ len = lv;
+ if (copy_to_user(optval, &v, len))
+ return -EFAULT;
+lenout:
+ /* Report how many bytes were actually written. */
+ if (put_user(len, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * Initialize an sk_lock.
+ *
+ * (We also register the sk_lock with the lock validator.)
+ *
+ * The lock class keys and their names are selected per address
+ * family by indexing the af_family_* tables with sk->sk_family, so
+ * sk->sk_family must already be set when this is called.
+ */
+static inline void sock_lock_init(struct sock *sk)
+{
+ sock_lock_init_class_and_name(sk,
+ af_family_slock_key_strings[sk->sk_family],
+ af_family_slock_keys + sk->sk_family,
+ af_family_key_strings[sk->sk_family],
+ af_family_keys + sk->sk_family);
+}
+
+/*
+ * Copy osk->sk_prot->obj_size bytes of @osk over @nsk.
+ *
+ * With CONFIG_SECURITY_NETWORK, nsk's own sk_security pointer is
+ * saved before the memcpy() and restored afterwards so that the raw
+ * copy does not overwrite it with osk's; security_sk_clone() is then
+ * called for the pair.
+ */
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+ void *sptr = nsk->sk_security;
+#endif
+
+ memcpy(nsk, osk, osk->sk_prot->obj_size);
+#ifdef CONFIG_SECURITY_NETWORK
+ nsk->sk_security = sptr;
+ security_sk_clone(osk, nsk);
+#endif
+}
+
+/*
+ * Allocate the memory for a sock of protocol @prot.
+ *
+ * The object comes from prot->slab when the protocol has one, and
+ * from kmalloc(prot->obj_size) otherwise. On a successful allocation
+ * the security blob is set up with security_sk_alloc() and a
+ * reference on prot->owner is taken with try_module_get(); if either
+ * step fails, everything done so far is undone.
+ *
+ * Returns the new sock, or NULL on any failure.
+ */
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ int family)
+{
+ struct sock *sk;
+ struct kmem_cache *slab;
+
+ slab = prot->slab;
+ if (slab != NULL)
+ sk = kmem_cache_alloc(slab, priority);
+ else
+ sk = kmalloc(prot->obj_size, priority);
+
+ if (sk != NULL) {
+ if (security_sk_alloc(sk, family, priority))
+ goto out_free;
+
+ if (!try_module_get(prot->owner))
+ goto out_free_sec;
+ }
+
+ /* sk is NULL here if the allocation itself failed. */
+ return sk;
+
+out_free_sec:
+ security_sk_free(sk);
+out_free:
+ /* Free through the same allocator that produced sk. */
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ return NULL;
+}
+
+/*
+ * Counterpart of sk_prot_alloc(): release the security blob, return
+ * @sk to prot->slab (or kfree() it when the protocol has no slab) and
+ * drop the reference on prot->owner.
+ */
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+ struct kmem_cache *slab;
+ struct module *owner;
+
+ /* Read both fields up front, before anything is released. */
+ owner = prot->owner;
+ slab = prot->slab;
+
+ security_sk_free(sk);
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ module_put(owner);
+}
+
+/**
+ *	sk_alloc - All socket objects are allocated here
+ *	@net: the applicable net namespace
+ *	@family: protocol family
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *	@prot: struct proto associated with this new sock instance
+ *
+ *	The object is requested zero-filled (__GFP_ZERO is added to
+ *	@priority). On success the family and protocol are recorded, the
+ *	socket lock is initialised and a reference on @net is taken.
+ *
+ *	Returns the new sock, or NULL if sk_prot_alloc() failed.
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+		      struct proto *prot)
+{
+	struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+
+	if (!sk)
+		return NULL;
+
+	/* sock_lock_init() indexes its tables by sk_family: set it first. */
+	sk->sk_family = family;
+	/*
+	 * See comment in struct sock definition to understand
+	 * why we need sk_prot_creator -acme
+	 */
+	sk->sk_prot_creator = prot;
+	sk->sk_prot = prot;
+	sock_lock_init(sk);
+	sock_net_set(sk, get_net(net));
+
+	return sk;
+}
+
+/*
+ * Tear down and free @sk.
+ *
+ * Runs the socket's sk_destruct hook if one is set, uncharges and
+ * detaches any attached filter, disables timestamping, reports
+ * leftover option memory, drops the network namespace reference and
+ * finally frees the object through the protocol that created it
+ * (sk_prot_creator).
+ */
+void sk_free(struct sock *sk)
+{
+ struct sk_filter *filter;
+
+ if (sk->sk_destruct)
+ sk->sk_destruct(sk);
+
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ sk_filter_uncharge(sk, filter);
+ rcu_assign_pointer(sk->sk_filter, NULL);
+ }
+
+ sock_disable_timestamp(sk);
+
+ /* Non-zero here means some option memory was never given back. */
+ if (atomic_read(&sk->sk_omem_alloc))
+ printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
+
+ put_net(sock_net(sk));
+ sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+/*
+ * The last sock_put() should drop the reference to sk->sk_net. It has
+ * already been dropped in sk_change_net. Taking a reference to a
+ * namespace that is being stopped is not an option.
+ * Take a reference to the socket so that it is removed from the hash
+ * _alive_, and after that destroy it in the context of init_net.
+ *
+ * Does nothing when @sk is NULL or has no struct socket attached.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+ if (sk == NULL || sk->sk_socket == NULL)
+ return;
+
+ /* Keep sk alive across sock_release(). */
+ sock_hold(sk);
+ sock_release(sk->sk_socket);
+ /* Re-home the sock in init_net before the final put. */
+ release_net(sock_net(sk));
+ sock_net_set(sk, get_net(&init_net));
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
+/*
+ * Create a new sock as a copy of @sk.
+ *
+ * The clone starts as a raw copy of the parent (sock_copy()); every
+ * field that must not be shared is then reinitialised: hash node,
+ * locks, backlog, memory accounting counters, skb queues, cached
+ * route, error state and priority. An attached filter is charged to
+ * the clone as well.
+ *
+ * On success the clone is returned with bh_lock_sock() still held
+ * (it is taken here and not released in this function), with
+ * sk_refcnt set to 2, and with no struct socket or wait queue
+ * attached.
+ *
+ * Returns NULL if the allocation or xfrm_sk_clone_policy() fails.
+ */
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+{
+ struct sock *newsk;
+
+ newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+ if (newsk != NULL) {
+ struct sk_filter *filter;
+
+ sock_copy(newsk, sk);
+
+ /* SANITY */
+ get_net(sock_net(newsk));
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+ atomic_set(&newsk->sk_wmem_alloc, 0);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ skb_queue_head_init(&newsk->sk_receive_queue);
+ skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+ skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
+
+ rwlock_init(&newsk->sk_dst_lock);
+ rwlock_init(&newsk->sk_callback_lock);
+ lockdep_set_class_and_name(&newsk->sk_callback_lock,
+ af_callback_keys + newsk->sk_family,
+ af_family_clock_key_strings[newsk->sk_family]);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_send_head = NULL;
+ /* Inherit the parent's userlocks except SOCK_BINDPORT_LOCK. */
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+ sock_reset_flag(newsk, SOCK_DONE);
+ skb_queue_head_init(&newsk->sk_error_queue);
+
+ /* The filter pointer was copied from the parent; charge it here too. */
+ filter = newsk->sk_filter;
+ if (filter != NULL)
+ sk_filter_charge(newsk, filter);
+
+ if (unlikely(xfrm_sk_clone_policy(newsk))) {
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
+ }
+
+ newsk->sk_err = 0;
+ newsk->sk_priority = 0;
+ atomic_set(&newsk->sk_refcnt, 2);
+
+ /*
+ * Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ newsk->sk_sleep = NULL;
+
+ if (newsk->sk_prot->sockets_allocated)
+ atomic_inc(newsk->sk_prot->sockets_allocated);
+ }
+out:
+ return newsk;
+}
+
+EXPORT_SYMBOL_GPL(sk_clone);
+
+/*
+ * Attach route @dst to @sk and derive the socket's offload
+ * capabilities (sk_route_caps) from the features of the route's
+ * output device.
+ *
+ * If the device advertises NETIF_F_GSO, the software GSO bits are
+ * added. When the socket can use GSO, a route with a non-zero
+ * header_len has all GSO bits stripped; otherwise scatter-gather and
+ * hardware checksum are added and the device's GSO size limit is
+ * recorded in sk_gso_max_size.
+ */
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+	__sk_dst_set(sk, dst);
+
+	sk->sk_route_caps = dst->dev->features;
+	if (sk->sk_route_caps & NETIF_F_GSO)
+		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+
+	if (!sk_can_gso(sk))
+		return;
+
+	if (dst->header_len) {
+		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		return;
+	}
+
+	sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+	sk->sk_gso_max_size = dst->dev->gso_max_size;
+}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
+
+/*
+ * Scale the socket buffer sysctl limits to the amount of physical
+ * memory: machines with at most 4096 pages get 32767-byte maxima and
+ * defaults, machines with at least 131072 pages get 131071-byte
+ * maxima. Anything in between keeps the values already in place.
+ */
+void __init sk_init(void)
+{
+	if (num_physpages <= 4096) {
+		sysctl_wmem_max = 32767;
+		sysctl_rmem_max = 32767;
+		sysctl_wmem_default = 32767;
+		sysctl_rmem_default = 32767;
+		return;
+	}
+
+	if (num_physpages >= 131072) {
+		sysctl_wmem_max = 131071;
+		sysctl_rmem_max = 131071;
+	}
+}
+
+/*
+ * Simple resource managers for sockets.
+ */
+
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ *
+ * Gives skb->truesize back to the owning socket's sk_wmem_alloc,
+ * calls the socket's sk_write_space hook unless SOCK_USE_WRITE_QUEUE
+ * is set, and drops a reference on the socket with sock_put().
+ */
+void sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ /* In case it might be waiting for more memory. */
+ atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
+ sk->sk_write_space(sk);
+ sock_put(sk);
+}
+
+/*
+ * Read buffer destructor automatically called from kfree_skb.
+ *
+ * Gives skb->truesize back to the owning socket's sk_rmem_alloc and
+ * uncharges the same amount via sk_mem_uncharge(). Unlike
+ * sock_wfree(), no socket reference is dropped here.
+ */
+void sock_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(skb->sk, skb->truesize);
+}
+
+
+/*
+ * Return the uid of the inode backing @sk's struct socket, or 0 when
+ * no struct socket is attached. sk_socket is sampled under the
+ * sk_callback_lock read lock.
+ */
+int sock_i_uid(struct sock *sk)
+{
+	int uid = 0;
+
+	read_lock(&sk->sk_callback_lock);
+	if (sk->sk_socket)
+		uid = SOCK_INODE(sk->sk_socket)->i_uid;
+	read_unlock(&sk->sk_callback_lock);
+
+	return uid;
+}
+
+/*
+ * Return the inode number backing @sk's struct socket, or 0 when no
+ * struct socket is attached. sk_socket is sampled under the
+ * sk_callback_lock read lock.
+ */
+unsigned long sock_i_ino(struct sock *sk)
+{
+	unsigned long ino = 0;
+
+	read_lock(&sk->sk_callback_lock);
+	if (sk->sk_socket)
+		ino = SOCK_INODE(sk->sk_socket)->i_ino;
+	read_unlock(&sk->sk_callback_lock);
+
+	return ino;
+}
+
+/*
+ * Allocate a skb from the socket's send buffer.
+ *
+ * The allocation is attempted only while sk_wmem_alloc is below
+ * sk_sndbuf, unless @force is set. A successfully allocated skb is
+ * charged to the socket with skb_set_owner_w().
+ *
+ * Returns the skb, or NULL if over the limit or out of memory.
+ */
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+			     gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	if (!force && atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
+		return NULL;
+
+	skb = alloc_skb(size, priority);
+	if (skb)
+		skb_set_owner_w(skb, sk);
+
+	return skb;
+}
+
+/*
+ * Allocate a skb from the socket's receive buffer.
+ *
+ * The allocation is attempted only while sk_rmem_alloc is below
+ * sk_rcvbuf, unless @force is set. A successfully allocated skb is
+ * charged to the socket with skb_set_owner_r().
+ *
+ * Returns the skb, or NULL if over the limit or out of memory.
+ */
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+			     gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	if (!force && atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return NULL;
+
+	skb = alloc_skb(size, priority);
+	if (skb)
+		skb_set_owner_r(skb, sk);
+
+	return skb;
+}
+
+/*
+ * Allocate a memory block from the socket's option memory buffer.
+ *
+ * The request is refused when @size alone exceeds sysctl_optmem_max
+ * or when it would bring sk_omem_alloc up to that limit. The charge
+ * is rolled back if kmalloc() fails.
+ *
+ * Returns the block, or NULL on refusal or allocation failure.
+ */
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
+{
+	void *mem;
+
+	if ((unsigned)size > sysctl_optmem_max)
+		return NULL;
+	if (atomic_read(&sk->sk_omem_alloc) + size >= sysctl_optmem_max)
+		return NULL;
+
+	/* First do the add, to avoid the race if kmalloc
+	 * might sleep.
+	 */
+	atomic_add(size, &sk->sk_omem_alloc);
+	mem = kmalloc(size, priority);
+	if (!mem)
+		atomic_sub(size, &sk->sk_omem_alloc);
+
+	return mem;
+}
+
+/*
+ * Free an option memory block.
+ *
+ * @size is subtracted from sk_omem_alloc, so it has to be the size
+ * that was charged when the block was obtained from sock_kmalloc().
+ */
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+ kfree(mem);
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
+ I think, these locks should be removed for datagram sockets.
+
+ Sleeps interruptibly on sk->sk_sleep until send buffer space is
+ available (sk_wmem_alloc < sk_sndbuf), the send side is shut down,
+ a socket error is pending, a signal arrives or @timeo runs out.
+ Returns the remaining timeout.
+ */
+static long sock_wait_for_wmem(struct sock * sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ for (;;) {
+ if (!timeo)
+ break;
+ if (signal_pending(current))
+ break;
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ /* Queue ourselves before testing the wake-up conditions. */
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ break;
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->sk_err)
+ break;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return timeo;
+}
+
+
+/*
+ * Generic send/receive buffer handlers
+ */
+
+/*
+ * Allocate a send skb with @header_len bytes of linear space and
+ * @data_len bytes of paged data, waiting for send buffer space if
+ * necessary.
+ *
+ * The paged part is built from single pages, one fragment each, so
+ * @data_len may not need more than MAX_SKB_FRAGS pages; larger
+ * requests are rejected with -EMSGSIZE instead of overrunning the
+ * fragment array.
+ *
+ * While sk_wmem_alloc is not below sk_sndbuf the caller sleeps in
+ * sock_wait_for_wmem(), bounded by the socket send timeout (no
+ * waiting at all when @noblock is set).
+ *
+ * Returns the skb, already charged to @sk via skb_set_owner_w(), or
+ * NULL with the reason stored in *@errcode: a pending socket error,
+ * -EPIPE after send shutdown, -EMSGSIZE, -ENOBUFS on allocation
+ * failure, -EAGAIN on timeout, or the value of sock_intr_errno() when
+ * interrupted by a signal.
+ */
+static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
+					    unsigned long header_len,
+					    unsigned long data_len,
+					    int noblock, int *errcode)
+{
+	struct sk_buff *skb;
+	gfp_t gfp_mask;
+	long timeo;
+	int err;
+
+	/*
+	 * Each page of paged data takes one slot in the skb's frags[]
+	 * array. Refuse anything that would not fit; comparing byte
+	 * counts also avoids overflow in the page-count computation.
+	 */
+	err = -EMSGSIZE;
+	if (data_len > ((unsigned long)MAX_SKB_FRAGS << PAGE_SHIFT))
+		goto failure;
+
+	gfp_mask = sk->sk_allocation;
+	if (gfp_mask & __GFP_WAIT)
+		gfp_mask |= __GFP_REPEAT;
+
+	timeo = sock_sndtimeo(sk, noblock);
+	while (1) {
+		err = sock_error(sk);
+		if (err != 0)
+			goto failure;
+
+		err = -EPIPE;
+		if (sk->sk_shutdown & SEND_SHUTDOWN)
+			goto failure;
+
+		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+			skb = alloc_skb(header_len, gfp_mask);
+			if (skb) {
+				int npages;
+				int i;
+
+				/* No pages, we're done... */
+				if (!data_len)
+					break;
+
+				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+				skb->truesize += data_len;
+				skb_shinfo(skb)->nr_frags = npages;
+				for (i = 0; i < npages; i++) {
+					struct page *page;
+					skb_frag_t *frag;
+
+					page = alloc_pages(sk->sk_allocation, 0);
+					if (!page) {
+						err = -ENOBUFS;
+						/*
+						 * Only the first i fragments hold
+						 * pages; make kfree_skb() release
+						 * exactly those.
+						 */
+						skb_shinfo(skb)->nr_frags = i;
+						kfree_skb(skb);
+						goto failure;
+					}
+
+					frag = &skb_shinfo(skb)->frags[i];
+					frag->page = page;
+					frag->page_offset = 0;
+					frag->size = (data_len >= PAGE_SIZE ?
+						      PAGE_SIZE :
+						      data_len);
+					data_len -= PAGE_SIZE;
+				}
+
+				/* Full success... */
+				break;
+			}
+			err = -ENOBUFS;
+			goto failure;
+		}
+
+		/* Send buffer full: flag it and wait for space. */
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		err = -EAGAIN;
+		if (!timeo)
+			goto failure;
+		if (signal_pending(current))
+			goto interrupted;
+		timeo = sock_wait_for_wmem(sk, timeo);
+	}
+
+	skb_set_owner_w(skb, sk);
+	return skb;
+
+interrupted:
+	err = sock_intr_errno(timeo);
+failure:
+	*errcode = err;
+	return NULL;
+}
+
+/*
+ * Allocate a purely linear send skb of @size bytes: a wrapper around
+ * sock_alloc_send_pskb() with no paged data. Returns the skb, or NULL
+ * with the error stored in *@errcode.
+ */
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+ int noblock, int *errcode)
+{
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+}
+
+/*
+ * Sleep uninterruptibly until the socket is no longer owned by a
+ * user context.
+ *
+ * The body unlocks sk_lock.slock around schedule() and re-takes it
+ * before testing ownership, so the function is entered and left with
+ * that spinlock held (see lock_sock_nested() for the caller side).
+ */
+static void __lock_sock(struct sock *sk)
+{
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ schedule();
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (!sock_owned_by_user(sk))
+ break;
+ }
+ finish_wait(&sk->sk_lock.wq, &wait);
+}
+
+/*
+ * Process every skb queued on the socket backlog.
+ *
+ * The backlog list is detached from the socket and bh_unlock_sock()
+ * is called before the skbs are fed to sk_backlog_rcv() one by one;
+ * the lock is re-taken afterwards and the whole procedure repeats
+ * for as long as new skbs were queued in the meantime.
+ *
+ * The initial skb is dereferenced unconditionally, so the backlog
+ * must be non-empty on entry.
+ */
+static void __release_sock(struct sock *sk)
+{
+ struct sk_buff *skb = sk->sk_backlog.head;
+
+ do {
+ /* Take the current list private, then drop the lock. */
+ sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+ bh_unlock_sock(sk);
+
+ do {
+ struct sk_buff *next = skb->next;
+
+ skb->next = NULL;
+ sk_backlog_rcv(sk, skb);
+
+ /*
+ * We are in process context here with softirqs
+ * disabled, use cond_resched_softirq() to preempt.
+ * This is safe to do because we've taken the backlog
+ * queue private:
+ */
+ cond_resched_softirq();
+
+ skb = next;
+ } while (skb != NULL);
+
+ bh_lock_sock(sk);
+ } while ((skb = sk->sk_backlog.head) != NULL);
+}
+
+/**
+ * sk_wait_data - wait for data to arrive at sk_receive_queue
+ * @sk: sock to wait on
+ * @timeo: for how long
+ *
+ * Now socket state including sk->sk_err is changed only under lock,
+ * hence we may omit checks after joining wait queue.
+ * We check receive queue before schedule() only as optimization;
+ * it is very likely that release_sock() added new data.
+ *
+ * SOCK_ASYNC_WAITDATA is set on the struct socket for the duration of
+ * the wait. Returns the value produced by sk_wait_event() for the
+ * condition "receive queue is not empty".
+ */
+int sk_wait_data(struct sock *sk, long *timeo)
+{
+ int rc;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ finish_wait(sk->sk_sleep, &wait);
+ return rc;
+}
+
+EXPORT_SYMBOL(sk_wait_data);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ *
+ * The charge is applied first and then checked against the
+ * protocol's three sysctl_mem thresholds. Returns 1 if the charge
+ * stands, or 0 after undoing it when the allocation is refused.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ struct proto *prot = sk->sk_prot;
+ int amt = sk_mem_pages(size);
+ int allocated;
+
+ /* Charge optimistically; rolled back at the end on refusal. */
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ allocated = atomic_add_return(amt, prot->memory_allocated);
+
+ /* Under limit. */
+ if (allocated <= prot->sysctl_mem[0]) {
+ if (prot->memory_pressure && *prot->memory_pressure)
+ *prot->memory_pressure = 0;
+ return 1;
+ }
+
+ /* Under pressure. */
+ if (allocated > prot->sysctl_mem[1])
+ if (prot->enter_memory_pressure)
+ prot->enter_memory_pressure(sk);
+
+ /* Over hard limit. */
+ if (allocated > prot->sysctl_mem[2])
+ goto suppress_allocation;
+
+ /* guarantee minimum buffer size under pressure */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+ return 1;
+ } else { /* SK_MEM_SEND */
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+ return 1;
+ } else if (atomic_read(&sk->sk_wmem_alloc) <
+ prot->sysctl_wmem[0])
+ return 1;
+ }
+
+ /*
+ * Allow the charge when the protocol is not under pressure, or
+ * when this socket's usage multiplied by the number of allocated
+ * sockets stays below the hard limit.
+ */
+ if (prot->memory_pressure) {
+ if (!*prot->memory_pressure ||
+ prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+ * In this case we cannot block, so that we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ return 1;
+ }
+
+ /* Alas. Undo changes. */
+ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+ atomic_sub(amt, prot->memory_allocated);
+ return 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ * __sk_mem_reclaim - reclaim memory_allocated
+ * @sk: socket
+ *
+ * Returns every whole SK_MEM_QUANTUM held in sk_forward_alloc to the
+ * protocol's memory_allocated counter, keeping only the sub-quantum
+ * remainder on the socket, and clears the protocol's memory pressure
+ * flag once memory_allocated has dropped below sysctl_mem[0].
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+ struct proto *prot = sk->sk_prot;
+
+ atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+ prot->memory_allocated);
+ sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+ if (prot->memory_pressure && *prot->memory_pressure &&
+ (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+ *prot->memory_pressure = 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
+/*
+ * Set of default routines for initialising struct proto_ops when
+ * the protocol does not support a particular function. In certain
+ * cases where it makes no sense for a protocol to have a "do nothing"
+ * function, some default processing is provided.
+ */
+
+/* Default bind() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default connect() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+ int len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default socketpair() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default accept() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default getname() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+ int *len, int peer)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default poll() for protocols without one: reports an empty event mask. */
+unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
+{
+ return 0;
+}
+
+/* Default ioctl() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default listen() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_listen(struct socket *sock, int backlog)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default shutdown() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_shutdown(struct socket *sock, int how)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default setsockopt() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default getsockopt() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default sendmsg() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+ size_t len)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default recvmsg() for protocols without one: always -EOPNOTSUPP. */
+int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+ size_t len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Default mmap() for protocols without one: always -ENODEV. */
+int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+{
+ /* Mirror missing mmap method error code */
+ return -ENODEV;
+}
+
+/*
+ * Default sendpage() for protocols without a native one: map the
+ * page with kmap() and send the @size bytes starting at @offset as a
+ * single kvec through kernel_sendmsg(), passing @flags as msg_flags.
+ *
+ * Returns whatever kernel_sendmsg() returns.
+ */
+ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+	struct msghdr hdr = { .msg_flags = flags };
+	struct kvec vec = {
+		.iov_base = (char *)kmap(page) + offset,
+		.iov_len = size,
+	};
+	ssize_t sent = kernel_sendmsg(sock, &hdr, &vec, 1, size);
+
+	kunmap(page);
+	return sent;
+}
+
+/*
+ * Default Socket Callbacks
+ */
+
+/*
+ * Default sk_state_change callback (installed by sock_init_data()):
+ * wake every interruptible sleeper on sk->sk_sleep, if there is a
+ * wait queue and it has waiters. Runs under the sk_callback_lock
+ * read lock.
+ */
+static void sock_def_wakeup(struct sock *sk)
+{
+ read_lock(&sk->sk_callback_lock);
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_all(sk->sk_sleep);
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Default sk_error_report callback (installed by sock_init_data()):
+ * wake interruptible sleepers on sk->sk_sleep and send a POLL_ERR
+ * async notification. Runs under the sk_callback_lock read lock.
+ */
+static void sock_def_error_report(struct sock *sk)
+{
+ read_lock(&sk->sk_callback_lock);
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible(sk->sk_sleep);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Default sk_data_ready callback (installed by sock_init_data()):
+ * wake interruptible sleepers on sk->sk_sleep and send a POLL_IN
+ * async notification. @len is not used by this implementation.
+ * Runs under the sk_callback_lock read lock.
+ */
+static void sock_def_readable(struct sock *sk, int len)
+{
+ read_lock(&sk->sk_callback_lock);
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Default sk_write_space callback (installed by sock_init_data()).
+ *
+ * Writers are woken only once at most half of the send buffer is in
+ * use (2 * sk_wmem_alloc <= sk_sndbuf). The POLL_OUT async
+ * notification is additionally gated on sock_writeable(). Runs under
+ * the sk_callback_lock read lock.
+ */
+static void sock_def_write_space(struct sock *sk)
+{
+ read_lock(&sk->sk_callback_lock);
+
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Default sk_destruct callback (installed by sock_init_data()):
+ * frees sk->sk_protinfo.
+ */
+static void sock_def_destruct(struct sock *sk)
+{
+ kfree(sk->sk_protinfo);
+}
+
+/*
+ * Deliver SIGURG to the owner of the socket's file, if the sock has
+ * a struct socket with a file attached. When send_sigurg() returns
+ * non-zero, a POLL_PRI async notification is sent as well.
+ */
+void sk_send_sigurg(struct sock *sk)
+{
+	if (!sk->sk_socket || !sk->sk_socket->file)
+		return;
+
+	if (send_sigurg(&sk->sk_socket->file->f_owner))
+		sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
+}
+
+/*
+ * (Re)arm @timer to fire at @expires. A reference on @sk is taken
+ * when mod_timer() returns zero -- presumably meaning the timer was
+ * not already pending, so that each pending timer holds exactly one
+ * socket reference (confirm against mod_timer() documentation).
+ */
+void sk_reset_timer(struct sock *sk, struct timer_list* timer,
+ unsigned long expires)
+{
+ if (!mod_timer(timer, expires))
+ sock_hold(sk);
+}
+
+EXPORT_SYMBOL(sk_reset_timer);
+
+/*
+ * Cancel @timer. If it was pending and del_timer() reports that it
+ * removed it, the socket reference associated with the timer (see
+ * sk_reset_timer()) is dropped with __sock_put().
+ */
+void sk_stop_timer(struct sock *sk, struct timer_list* timer)
+{
+ if (timer_pending(timer) && del_timer(timer))
+ __sock_put(sk);
+}
+
+EXPORT_SYMBOL(sk_stop_timer);
+
+/*
+ * Initialise the generic part of @sk and tie it to @sock.
+ *
+ * Sets up the skb queues, the socket timer, default buffer sizes
+ * (from the sysctl defaults), the locks and the default callbacks,
+ * and puts the sock in TCP_CLOSE state with SOCK_ZAPPED set and a
+ * reference count of 1.
+ *
+ * @sock may be NULL, in which case the sock gets no wait queue
+ * (sk_sleep is NULL) and sk_type is left untouched.
+ */
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ skb_queue_head_init(&sk->sk_receive_queue);
+ skb_queue_head_init(&sk->sk_write_queue);
+ skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+ skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
+
+ sk->sk_send_head = NULL;
+
+ init_timer(&sk->sk_timer);
+
+ sk->sk_allocation = GFP_KERNEL;
+ sk->sk_rcvbuf = sysctl_rmem_default;
+ sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_state = TCP_CLOSE;
+ sk_set_socket(sk, sock);
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ if (sock) {
+ /* Link both ways and share the socket's wait queue. */
+ sk->sk_type = sock->type;
+ sk->sk_sleep = &sock->wait;
+ sock->sk = sk;
+ } else
+ sk->sk_sleep = NULL;
+
+ rwlock_init(&sk->sk_dst_lock);
+ rwlock_init(&sk->sk_callback_lock);
+ /* Per-address-family lockdep class, indexed by sk_family. */
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
+
+ /* Default callbacks; protocols may override them afterwards. */
+ sk->sk_state_change = sock_def_wakeup;
+ sk->sk_data_ready = sock_def_readable;
+ sk->sk_write_space = sock_def_write_space;
+ sk->sk_error_report = sock_def_error_report;
+ sk->sk_destruct = sock_def_destruct;
+
+ sk->sk_sndmsg_page = NULL;
+ sk->sk_sndmsg_off = 0;
+
+ sk->sk_peercred.pid = 0;
+ sk->sk_peercred.uid = -1;
+ sk->sk_peercred.gid = -1;
+ sk->sk_write_pending = 0;
+ sk->sk_rcvlowat = 1;
+ /* No receive/send timeout by default. */
+ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ /* tv_sec == -1 is what sock_get_timestamp() reports as -ENOENT. */
+ sk->sk_stamp = ktime_set(-1L, 0);
+
+ atomic_set(&sk->sk_refcnt, 1);
+ atomic_set(&sk->sk_drops, 0);
+}
+
+/*
+ * Take ownership of the socket lock from process context, sleeping
+ * in __lock_sock() while another owner holds it. @subclass is passed
+ * on to lockdep via mutex_acquire().
+ *
+ * The spinlock is taken with spin_lock_bh() but released with plain
+ * spin_unlock(); bottom halves are re-enabled separately by
+ * local_bh_enable() after the lockdep annotation.
+ */
+void lock_sock_nested(struct sock *sk, int subclass)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_lock.owned)
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+ local_bh_enable();
+}
+
+EXPORT_SYMBOL(lock_sock_nested);
+
+/*
+ * Give up ownership of the socket lock.
+ *
+ * Any skbs queued on the backlog are processed via __release_sock()
+ * before ownership is cleared, and waiters sleeping on sk_lock.wq
+ * are then woken.
+ */
+void release_sock(struct sock *sk)
+{
+ /*
+ * The sk_lock has mutex_unlock() semantics:
+ */
+ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_backlog.tail)
+ __release_sock(sk);
+ sk->sk_lock.owned = 0;
+ if (waitqueue_active(&sk->sk_lock.wq))
+ wake_up(&sk->sk_lock.wq);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+EXPORT_SYMBOL(release_sock);
+
+/*
+ * Copy the socket's timestamp (sk_stamp) to user space as a struct
+ * timeval, enabling timestamping on the socket first if it was off.
+ *
+ * A stamp whose seconds field is -1 (the initial value set by
+ * sock_init_data()) yields -ENOENT. A stamp whose seconds field is 0
+ * is replaced by the current real time before being reported.
+ *
+ * Returns 0 on success, -ENOENT, or -EFAULT if the copy fails.
+ */
+int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+{
+ struct timeval tv;
+ if (!sock_flag(sk, SOCK_TIMESTAMP))
+ sock_enable_timestamp(sk);
+ tv = ktime_to_timeval(sk->sk_stamp);
+ if (tv.tv_sec == -1)
+ return -ENOENT;
+ if (tv.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ tv = ktime_to_timeval(sk->sk_stamp);
+ }
+ return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestamp);
+
+/*
+ * Nanosecond variant of sock_get_timestamp(): copy sk_stamp to user space
+ * as a struct timespec.
+ *
+ * Turns on SOCK_TIMESTAMP if needed. tv_sec == -1 yields -ENOENT; tv_sec == 0
+ * is replaced by the current real time (stored back into sk_stamp).
+ *
+ * Returns 0 on success, -ENOENT or -EFAULT.
+ */
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+	struct timespec stamp;
+
+	if (!sock_flag(sk, SOCK_TIMESTAMP))
+		sock_enable_timestamp(sk);
+
+	stamp = ktime_to_timespec(sk->sk_stamp);
+	switch (stamp.tv_sec) {
+	case -1:
+		return -ENOENT;
+	case 0:
+		sk->sk_stamp = ktime_get_real();
+		stamp = ktime_to_timespec(sk->sk_stamp);
+		break;
+	default:
+		break;
+	}
+
+	if (copy_to_user(userstamp, &stamp, sizeof(stamp)))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
+
+/*
+ * Set SOCK_TIMESTAMP on @sk and call net_enable_timestamp(), but only on
+ * the first call for a given socket; later calls are no-ops.
+ */
+void sock_enable_timestamp(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_TIMESTAMP))
+		return;
+
+	sock_set_flag(sk, SOCK_TIMESTAMP);
+	net_enable_timestamp();
+}
+
+/*
+ * Get a socket option on a socket by forwarding to the protocol's
+ * getsockopt handler.
+ *
+ * FIX: POSIX 1003.1g is very ambiguous here. It states that
+ * asynchronous errors should be reported by getsockopt. We assume
+ * this means if you specify SO_ERROR (otherwise what's the point of it).
+ */
+int sock_common_getsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, int __user *optlen)
+{
+	return sock->sk->sk_prot->getsockopt(sock->sk, level, optname,
+					     optval, optlen);
+}
+
+EXPORT_SYMBOL(sock_common_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat getsockopt entry point: use the protocol's compat_getsockopt
+ * handler when it provides one, otherwise its regular getsockopt handler.
+ */
+int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
+				  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk->sk_prot->compat_getsockopt == NULL)
+		return sk->sk_prot->getsockopt(sk, level, optname,
+					       optval, optlen);
+
+	return sk->sk_prot->compat_getsockopt(sk, level, optname,
+					      optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
+/*
+ * Generic recvmsg: split MSG_DONTWAIT out of @flags, call the protocol's
+ * recvmsg handler and, when it did not fail, publish the address length it
+ * reported in msg->msg_namelen.
+ *
+ * Returns whatever the protocol handler returned.
+ */
+int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	const int noblock = flags & MSG_DONTWAIT;
+	const int other_flags = flags & ~MSG_DONTWAIT;
+	int addr_len = 0;
+	int ret;
+
+	ret = sk->sk_prot->recvmsg(iocb, sk, msg, size, noblock,
+				   other_flags, &addr_len);
+	if (ret < 0)
+		return ret;
+
+	msg->msg_namelen = addr_len;
+	return ret;
+}
+
+EXPORT_SYMBOL(sock_common_recvmsg);
+
+/*
+ * Set a socket option by forwarding to the protocol's setsockopt handler.
+ */
+int sock_common_setsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, int optlen)
+{
+	return sock->sk->sk_prot->setsockopt(sock->sk, level, optname,
+					     optval, optlen);
+}
+
+EXPORT_SYMBOL(sock_common_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat setsockopt entry point: use the protocol's compat_setsockopt
+ * handler when it provides one, otherwise its regular setsockopt handler.
+ */
+int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
+				  char __user *optval, int optlen)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk->sk_prot->compat_setsockopt == NULL)
+		return sk->sk_prot->setsockopt(sk, level, optname,
+					       optval, optlen);
+
+	return sk->sk_prot->compat_setsockopt(sk, level, optname,
+					      optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
+
+/*
+ * sk_common_release - common teardown for a socket being released.
+ * @sk: socket to release
+ *
+ * Runs the protocol's optional destroy hook, unhashes the socket, orphans
+ * it, frees its xfrm policy and drops one reference with sock_put().
+ */
+void sk_common_release(struct sock *sk)
+{
+ if (sk->sk_prot->destroy)
+ sk->sk_prot->destroy(sk);
+
+ /*
+ * Observation: when sk_common_release is called, processes have
+ * no access to the socket any more, but the network stack still has.
+ * Step one, detach it from networking:
+ *
+ * A. Remove from hash tables.
+ */
+
+ sk->sk_prot->unhash(sk);
+
+ /*
+ * At this point the socket cannot receive new packets, but it is
+ * possible that some packets are in flight because some CPU runs the
+ * receiver and did the hash table lookup before we unhashed the
+ * socket. They will reach the receive queue and will be purged by the
+ * socket destructor.
+ *
+ * Also we still have packets pending on the receive queue and,
+ * probably, our own packets waiting in device queues. The destructor
+ * will drain the receive queue, but transmitted packets will delay
+ * socket destruction until the last reference is released.
+ */
+
+ sock_orphan(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+ sock_put(sk);
+}
+
+EXPORT_SYMBOL(sk_common_release);
+
+/* List of registered protocols; modified and walked under proto_list_lock. */
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR 64 /* should be enough for the first time */
+/* One in-use counter per protocol slot, indexed by proto->inuse_idx. */
+struct prot_inuse {
+ int val[PROTO_INUSE_NR];
+};
+
+/* Bitmap of the inuse_idx slots handed out by assign_proto_idx(). */
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+/*
+ * Add @val to the current CPU's in-use counter of @prot in namespace @net.
+ */
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+	struct prot_inuse *counters;
+
+	counters = per_cpu_ptr(net->core.inuse, smp_processor_id());
+	counters->val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+/*
+ * Sum the per-CPU in-use counters of @prot in namespace @net over all
+ * possible CPUs. A negative sum is reported as 0.
+ */
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+	int slot = prot->inuse_idx;
+	int total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += per_cpu_ptr(net->core.inuse, cpu)->val[slot];
+
+	if (total < 0)
+		total = 0;
+	return total;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+/*
+ * Per-namespace setup: allocate the per-CPU in-use counters.
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int sock_inuse_init_net(struct net *net)
+{
+	net->core.inuse = alloc_percpu(struct prot_inuse);
+	if (net->core.inuse == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Per-namespace teardown: free the per-CPU counters in net->core.inuse. */
+static void sock_inuse_exit_net(struct net *net)
+{
+ free_percpu(net->core.inuse);
+}
+
+/* Namespace hooks that allocate and free net->core.inuse. */
+static struct pernet_operations net_inuse_ops = {
+ .init = sock_inuse_init_net,
+ .exit = sock_inuse_exit_net,
+};
+
+/*
+ * Register the in-use counter namespace hooks at core_initcall time.
+ * Failure to register is fatal (panic).
+ */
+static __init int net_inuse_init(void)
+{
+	int err = register_pernet_subsys(&net_inuse_ops);
+
+	if (err != 0)
+		panic("Cannot initialize net inuse counters");
+
+	return 0;
+}
+
+core_initcall(net_inuse_init);
+#else
+/* Without CONFIG_NET_NS there is a single, global set of per-CPU counters. */
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+/*
+ * Add @val to the current CPU's in-use counter of @prot. @net is unused.
+ */
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+	struct prot_inuse *counters = &__get_cpu_var(prot_inuse);
+
+	counters->val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+/*
+ * Sum the global per-CPU in-use counters of @prot over all possible CPUs.
+ * A negative sum is reported as 0. @net is unused.
+ */
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+	int slot = prot->inuse_idx;
+	int total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += per_cpu(prot_inuse, cpu).val[slot];
+
+	if (total < 0)
+		total = 0;
+	return total;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+/*
+ * Give @prot the first free counter slot.
+ *
+ * The last slot (PROTO_INUSE_NR - 1) is never marked as taken in the
+ * bitmap: once all lower slots are used, every further protocol lands on
+ * that slot and an error is logged.
+ */
+static void assign_proto_idx(struct proto *prot)
+{
+	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+	if (likely(prot->inuse_idx != PROTO_INUSE_NR - 1)) {
+		set_bit(prot->inuse_idx, proto_inuse_idx);
+		return;
+	}
+
+	printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+}
+
+/*
+ * Return @prot's counter slot to the bitmap. The last slot is never marked
+ * as taken by assign_proto_idx(), so there is nothing to clear for it.
+ */
+static void release_proto_idx(struct proto *prot)
+{
+	if (prot->inuse_idx == PROTO_INUSE_NR - 1)
+		return;
+
+	clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+/* Without CONFIG_PROC_FS there are no in-use counters: slot handling is a no-op. */
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+/* No-op counterpart of the stub above. */
+static inline void release_proto_idx(struct proto *prot)
+{
+}
+#endif
+
+/*
+ * proto_register - register a protocol with the socket core.
+ * @prot: protocol to register
+ * @alloc_slab: non-zero to create the slab caches for @prot
+ *
+ * With @alloc_slab set, creates the sock slab cache and, when @prot has
+ * request-sock and/or timewait-sock ops, their caches as well (named
+ * "request_sock_<name>" and "tw_sock_<name>"). The protocol is then added
+ * to proto_list and given an in-use counter slot.
+ *
+ * Returns 0 on success, or -ENOBUFS when an allocation fails; in that case
+ * everything created so far has been released again.
+ */
+int proto_register(struct proto *prot, int alloc_slab)
+{
+	if (alloc_slab) {
+		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
+					       SLAB_HWCACHE_ALIGN, NULL);
+
+		if (prot->slab == NULL) {
+			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
+			       prot->name);
+			goto out;
+		}
+
+		if (prot->rsk_prot != NULL) {
+			static const char mask[] = "request_sock_%s";
+
+			/* sizeof(mask) covers the "%s" and the NUL, so the name fits. */
+			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+			if (prot->rsk_prot->slab_name == NULL)
+				goto out_free_sock_slab;
+
+			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
+			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
+								 prot->rsk_prot->obj_size, 0,
+								 SLAB_HWCACHE_ALIGN, NULL);
+
+			if (prot->rsk_prot->slab == NULL) {
+				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+				       prot->name);
+				goto out_free_request_sock_slab_name;
+			}
+		}
+
+		if (prot->twsk_prot != NULL) {
+			static const char mask[] = "tw_sock_%s";
+
+			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+
+			if (prot->twsk_prot->twsk_slab_name == NULL)
+				goto out_free_request_sock_slab;
+
+			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
+			prot->twsk_prot->twsk_slab =
+				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
+						  prot->twsk_prot->twsk_obj_size,
+						  0, SLAB_HWCACHE_ALIGN,
+						  NULL);
+			if (prot->twsk_prot->twsk_slab == NULL)
+				goto out_free_timewait_sock_slab_name;
+		}
+	}
+
+	write_lock(&proto_list_lock);
+	list_add(&prot->node, &proto_list);
+	assign_proto_idx(prot);
+	write_unlock(&proto_list_lock);
+	return 0;
+
+out_free_timewait_sock_slab_name:
+	kfree(prot->twsk_prot->twsk_slab_name);
+	prot->twsk_prot->twsk_slab_name = NULL;
+out_free_request_sock_slab:
+	if (prot->rsk_prot && prot->rsk_prot->slab) {
+		kmem_cache_destroy(prot->rsk_prot->slab);
+		prot->rsk_prot->slab = NULL;
+	}
+out_free_request_sock_slab_name:
+	/*
+	 * The timewait failure paths fall through to here even for protocols
+	 * without request-sock ops, so rsk_prot must be checked before it is
+	 * dereferenced.
+	 */
+	if (prot->rsk_prot != NULL) {
+		kfree(prot->rsk_prot->slab_name);
+		prot->rsk_prot->slab_name = NULL;
+	}
+out_free_sock_slab:
+	kmem_cache_destroy(prot->slab);
+	prot->slab = NULL;
+out:
+	return -ENOBUFS;
+}
+
+EXPORT_SYMBOL(proto_register);
+
+/*
+ * proto_unregister - undo proto_register().
+ * @prot: protocol to unregister
+ *
+ * Releases the counter slot, unlinks @prot from proto_list and destroys
+ * whichever of the sock, request-sock and timewait-sock slab caches exist,
+ * together with the names allocated for the latter two.
+ */
+void proto_unregister(struct proto *prot)
+{
+	write_lock(&proto_list_lock);
+	release_proto_idx(prot);
+	list_del(&prot->node);
+	write_unlock(&proto_list_lock);
+
+	if (prot->slab != NULL) {
+		kmem_cache_destroy(prot->slab);
+		prot->slab = NULL;
+	}
+
+	if (prot->rsk_prot != NULL) {
+		if (prot->rsk_prot->slab != NULL) {
+			/* Destroy the cache before freeing the name it was created with. */
+			kmem_cache_destroy(prot->rsk_prot->slab);
+			kfree(prot->rsk_prot->slab_name);
+			prot->rsk_prot->slab = NULL;
+		}
+	}
+
+	if (prot->twsk_prot != NULL) {
+		if (prot->twsk_prot->twsk_slab != NULL) {
+			kmem_cache_destroy(prot->twsk_prot->twsk_slab);
+			kfree(prot->twsk_prot->twsk_slab_name);
+			prot->twsk_prot->twsk_slab = NULL;
+		}
+	}
+}
+
+EXPORT_SYMBOL(proto_unregister);
+
+#ifdef CONFIG_PROC_FS
+/* seq_file start: take proto_list_lock for reading and position at *pos (list head included). */
+static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(proto_list_lock)
+{
+ read_lock(&proto_list_lock);
+ return seq_list_start_head(&proto_list, *pos);
+}
+
+/* seq_file next: advance to the following entry of proto_list. */
+static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &proto_list, pos);
+}
+
+/* seq_file stop: drop the read lock taken in proto_seq_start(). */
+static void proto_seq_stop(struct seq_file *seq, void *v)
+ __releases(proto_list_lock)
+{
+ read_unlock(&proto_list_lock);
+}
+
+/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
+static char proto_method_implemented(const void *method)
+{
+	if (method != NULL)
+		return 'y';
+
+	return 'n';
+}
+
+/*
+ * Print one protocol line: name, object size, socket and memory counters
+ * (-1 when the protocol does not track them), memory pressure ("NI" when
+ * not tracked), max header, slab presence, owning module, and one y/n flag
+ * per method in the same order as the header printed by proto_seq_show().
+ */
+static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
+{
+ seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
+ "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+ proto->name,
+ proto->obj_size,
+ proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
+ proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
+ proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ proto->max_header,
+ proto->slab == NULL ? "no" : "yes",
+ module_name(proto->owner),
+ proto_method_implemented(proto->close),
+ proto_method_implemented(proto->connect),
+ proto_method_implemented(proto->disconnect),
+ proto_method_implemented(proto->accept),
+ proto_method_implemented(proto->ioctl),
+ proto_method_implemented(proto->init),
+ proto_method_implemented(proto->destroy),
+ proto_method_implemented(proto->shutdown),
+ proto_method_implemented(proto->setsockopt),
+ proto_method_implemented(proto->getsockopt),
+ proto_method_implemented(proto->sendmsg),
+ proto_method_implemented(proto->recvmsg),
+ proto_method_implemented(proto->sendpage),
+ proto_method_implemented(proto->bind),
+ proto_method_implemented(proto->backlog_rcv),
+ proto_method_implemented(proto->hash),
+ proto_method_implemented(proto->unhash),
+ proto_method_implemented(proto->get_port),
+ proto_method_implemented(proto->enter_memory_pressure));
+}
+
+/*
+ * seq_file show: the list head stands for the header line, every other
+ * element is a registered protocol. Always returns 0.
+ */
+static int proto_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != &proto_list) {
+		proto_seq_printf(seq, list_entry(v, struct proto, node));
+		return 0;
+	}
+
+	seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
+		   "protocol", "size", "sockets", "memory",
+		   "press", "maxhdr", "slab", "module",
+		   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+	return 0;
+}
+
+/* Iterator callbacks for the protocol listing. */
+static const struct seq_operations proto_seq_ops = {
+ .start = proto_seq_start,
+ .next = proto_seq_next,
+ .stop = proto_seq_stop,
+ .show = proto_seq_show,
+};
+
+/* open(): attach the seq_file iterator above to the file. */
+static int proto_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &proto_seq_ops);
+}
+
+/* File operations of the proc entry; everything but open is generic seq_file code. */
+static const struct file_operations proto_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = proto_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Create /proc/net/protocols in the initial namespace.
+ * Returns 0 on success, -ENOBUFS if the entry cannot be created.
+ */
+static int __init proto_init(void)
+{
+	/* register /proc/net/protocols */
+	if (proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL)
+		return -ENOBUFS;
+
+	return 0;
+}
+
+subsys_initcall(proto_init);
+
+#endif /* PROC_FS */
+
+EXPORT_SYMBOL(sk_alloc);
+EXPORT_SYMBOL(sk_free);
+EXPORT_SYMBOL(sk_send_sigurg);
+EXPORT_SYMBOL(sock_alloc_send_skb);
+EXPORT_SYMBOL(sock_init_data);
+EXPORT_SYMBOL(sock_kfree_s);
+EXPORT_SYMBOL(sock_kmalloc);
+EXPORT_SYMBOL(sock_no_accept);
+EXPORT_SYMBOL(sock_no_bind);
+EXPORT_SYMBOL(sock_no_connect);
+EXPORT_SYMBOL(sock_no_getname);
+EXPORT_SYMBOL(sock_no_getsockopt);
+EXPORT_SYMBOL(sock_no_ioctl);
+EXPORT_SYMBOL(sock_no_listen);
+EXPORT_SYMBOL(sock_no_mmap);
+EXPORT_SYMBOL(sock_no_poll);
+EXPORT_SYMBOL(sock_no_recvmsg);
+EXPORT_SYMBOL(sock_no_sendmsg);
+EXPORT_SYMBOL(sock_no_sendpage);
+EXPORT_SYMBOL(sock_no_setsockopt);
+EXPORT_SYMBOL(sock_no_shutdown);
+EXPORT_SYMBOL(sock_no_socketpair);
+EXPORT_SYMBOL(sock_rfree);
+EXPORT_SYMBOL(sock_setsockopt);
+EXPORT_SYMBOL(sock_wfree);
+EXPORT_SYMBOL(sock_wmalloc);
+EXPORT_SYMBOL(sock_i_uid);
+EXPORT_SYMBOL(sock_i_ino);
+EXPORT_SYMBOL(sysctl_optmem_max);
diff --git a/net/core/stream.c b/net/core/stream.c
new file mode 100644
index 0000000..8727cea
--- /dev/null
+++ b/net/core/stream.c
@@ -0,0 +1,209 @@
+/*
+ * SUCS NET3:
+ *
+ * Generic stream handling routines. These are generic for most
+ * protocols. Even IP. Tonight 8-).
+ * This is used because TCP, LLC (others too) layer all have mostly
+ * identical sendmsg() and recvmsg() code.
+ * So we (will) share it here.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * (from old tcp.c code)
+ * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/signal.h>
+#include <linux/tcp.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+
+/**
+ * sk_stream_write_space - stream socket write_space callback.
+ * @sk: socket
+ *
+ * Does nothing unless the free write space has reached the minimum
+ * (sk_stream_wspace() >= sk_stream_min_wspace()) and @sk is attached to a
+ * struct socket. Otherwise clears SOCK_NOSPACE, wakes sleepers on sk_sleep
+ * and, unless the send side is shut down, notifies the fasync listeners.
+ */
+void sk_stream_write_space(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk) || !sock)
+		return;
+
+	clear_bit(SOCK_NOSPACE, &sock->flags);
+
+	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+		wake_up_interruptible(sk->sk_sleep);
+
+	if (!sock->fasync_list)
+		return;
+	if (sk->sk_shutdown & SEND_SHUTDOWN)
+		return;
+
+	sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+}
+
+EXPORT_SYMBOL(sk_stream_write_space);
+
+/**
+ * sk_stream_wait_connect - Wait for a socket to get into the connected state
+ * @sk: sock to wait on
+ * @timeo_p: for how long to wait
+ *
+ * Must be called with the socket locked.
+ *
+ * Returns 0 once the socket is in ESTABLISHED or CLOSE_WAIT without a
+ * pending error; otherwise the pending socket error, -EPIPE when the state
+ * is neither SYN_SENT nor SYN_RECV, -EAGAIN when the timeout is used up, or
+ * sock_intr_errno() when a signal is pending.
+ */
+int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		int connected;
+		int err = sock_error(sk);
+
+		if (err)
+			return err;
+		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
+			return -EPIPE;
+		if (!*timeo_p)
+			return -EAGAIN;
+		if (signal_pending(current))
+			return sock_intr_errno(*timeo_p);
+
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		sk->sk_write_pending++;
+		connected = sk_wait_event(sk, timeo_p,
+					  !sk->sk_err &&
+					  !((1 << sk->sk_state) &
+					    ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
+		finish_wait(sk->sk_sleep, &wait);
+		sk->sk_write_pending--;
+
+		if (connected)
+			return 0;
+	}
+}
+
+EXPORT_SYMBOL(sk_stream_wait_connect);
+
+/**
+ * sk_stream_closing - Test whether the socket is still closing down.
+ * @sk: socket to verify
+ *
+ * Returns non-zero (the matching state bit) while the socket is in
+ * FIN_WAIT1, CLOSING or LAST_ACK, i.e. while we may still have things to
+ * send in our buffers; 0 otherwise.
+ */
+static inline int sk_stream_closing(struct sock *sk)
+{
+	const int closing_states = TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK;
+	const int state_bit = 1 << sk->sk_state;
+
+	return state_bit & closing_states;
+}
+
+/*
+ * Wait up to @timeout for the socket to leave the closing states (see
+ * sk_stream_closing()). Returns early when a signal is pending; does
+ * nothing when @timeout is 0.
+ */
+void sk_stream_wait_close(struct sock *sk, long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	if (!timeout)
+		return;
+
+	for (;;) {
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+			break;
+		if (signal_pending(current) || !timeout)
+			break;
+	}
+
+	finish_wait(sk->sk_sleep, &wait);
+}
+
+EXPORT_SYMBOL(sk_stream_wait_close);
+
+/**
+ * sk_stream_wait_memory - Wait for more memory for a socket
+ * @sk: socket to wait for memory
+ * @timeo_p: for how long
+ *
+ * If the socket already reports free send memory on entry, a short random
+ * delay (vm_wait) is used for the first sleep instead of the caller's
+ * timeout; the time spent is then charged against *@timeo_p.
+ *
+ * Returns 0 when send memory is available, -EPIPE on a socket error or
+ * send shutdown, -EAGAIN when *@timeo_p is 0, or sock_intr_errno() when a
+ * signal is pending. *@timeo_p is updated with the remaining time.
+ */
+int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+{
+	int err = 0;
+	long vm_wait = 0;
+	long current_timeo = *timeo_p;
+	DEFINE_WAIT(wait);
+
+	if (sk_stream_memory_free(sk))
+		current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
+
+	while (1) {
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+			goto do_error;
+		if (!*timeo_p)
+			goto do_nonblock;
+		if (signal_pending(current))
+			goto do_interrupted;
+		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		if (sk_stream_memory_free(sk) && !vm_wait)
+			break;
+
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		/*
+		 * The condition tells sk_wait_event() when to stop waiting:
+		 * on an error, on send shutdown, or once memory is free and
+		 * no vm_wait delay is owed. These are the same events the
+		 * loop itself checks for.
+		 */
+		sk_wait_event(sk, &current_timeo, sk->sk_err ||
+						  (sk->sk_shutdown & SEND_SHUTDOWN) ||
+						  (sk_stream_memory_free(sk) &&
+						   !vm_wait));
+		sk->sk_write_pending--;
+
+		if (vm_wait) {
+			/* Charge the part of the random delay actually slept. */
+			vm_wait -= current_timeo;
+			current_timeo = *timeo_p;
+			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
+			    (current_timeo -= vm_wait) < 0)
+				current_timeo = 0;
+			vm_wait = 0;
+		}
+		*timeo_p = current_timeo;
+	}
+out:
+	finish_wait(sk->sk_sleep, &wait);
+	return err;
+
+do_error:
+	err = -EPIPE;
+	goto out;
+do_nonblock:
+	err = -EAGAIN;
+	goto out;
+do_interrupted:
+	err = sock_intr_errno(*timeo_p);
+	goto out;
+}
+
+EXPORT_SYMBOL(sk_stream_wait_memory);
+
+/*
+ * Final error processing for a stream send path.
+ *
+ * An -EPIPE is replaced by the pending socket error when there is one. If
+ * the result is still -EPIPE and MSG_NOSIGNAL is not set in @flags,
+ * SIGPIPE is sent to the current task. Returns the resulting error.
+ */
+int sk_stream_error(struct sock *sk, int flags, int err)
+{
+	if (err == -EPIPE) {
+		int pending = sock_error(sk);
+
+		if (pending)
+			err = pending;
+	}
+
+	if (err != -EPIPE)
+		return err;
+
+	if (!(flags & MSG_NOSIGNAL))
+		send_sig(SIGPIPE, current, 0);
+	return err;
+}
+
+EXPORT_SYMBOL(sk_stream_error);
+
+/*
+ * Purge the receive and error queues of @sk, reclaim its accounted memory
+ * and warn if the write queue, sk_wmem_queued or sk_forward_alloc are not
+ * already empty/zero at that point.
+ */
+void sk_stream_kill_queues(struct sock *sk)
+{
+ /* First the read buffer. */
+ __skb_queue_purge(&sk->sk_receive_queue);
+
+ /* Next, the error queue. */
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ /* Next, the write queue: it is only checked, not purged. */
+ WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+
+ /* Account for returned memory. */
+ sk_mem_reclaim(sk);
+
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+
+ /* It is _impossible_ for the backlog to contain anything
+ * when we get here. All user references to this socket
+ * have gone away, only the net layer can touch it.
+ */
+}
+
+EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
new file mode 100644
index 0000000..f686467
--- /dev/null
+++ b/net/core/sysctl_net_core.c
@@ -0,0 +1,214 @@
+/* -*- linux-c -*-
+ * sysctl_net_core.c: sysctl interface to net core subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/core directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+
+/*
+ * Sysctls registered once under net/core by sysctl_core_init(). Every
+ * entry exposes one global variable, mode 0644, through proc_dointvec
+ * (message_cost goes through the jiffies handlers instead).
+ */
+static struct ctl_table net_core_table[] = {
+#ifdef CONFIG_NET
+ {
+ .ctl_name = NET_CORE_WMEM_MAX,
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_RMEM_MAX,
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_WMEM_DEFAULT,
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_RMEM_DEFAULT,
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_DEV_WEIGHT,
+ .procname = "dev_weight",
+ .data = &weight_p,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_MAX_BACKLOG,
+ .procname = "netdev_max_backlog",
+ .data = &netdev_max_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_MSG_COST,
+ .procname = "message_cost",
+ .data = &net_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_CORE_MSG_BURST,
+ .procname = "message_burst",
+ .data = &net_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_CORE_OPTMEM_MAX,
+ .procname = "optmem_max",
+ .data = &sysctl_optmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#ifdef CONFIG_XFRM
+ {
+ .ctl_name = NET_CORE_AEVENT_ETIME,
+ .procname = "xfrm_aevent_etime",
+ .data = &sysctl_xfrm_aevent_etime,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_AEVENT_RSEQTH,
+ .procname = "xfrm_aevent_rseqth",
+ .data = &sysctl_xfrm_aevent_rseqth,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "xfrm_larval_drop",
+ .data = &sysctl_xfrm_larval_drop,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "xfrm_acq_expires",
+ .data = &sysctl_xfrm_acq_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#endif /* CONFIG_XFRM */
+#endif /* CONFIG_NET */
+ {
+ .ctl_name = NET_CORE_BUDGET,
+ .procname = "netdev_budget",
+ .data = &netdev_budget,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_CORE_WARNINGS,
+ .procname = "warnings",
+ .data = &net_msg_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ { .ctl_name = 0 }
+};
+
+/*
+ * Per-namespace sysctls. This template points at init_net's value;
+ * sysctl_core_net_init() duplicates it for every other namespace and
+ * redirects entry 0 to that namespace's own sysctl_somaxconn.
+ */
+static struct ctl_table netns_core_table[] = {
+ {
+ .ctl_name = NET_CORE_SOMAXCONN,
+ .procname = "somaxconn",
+ .data = &init_net.core.sysctl_somaxconn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ { .ctl_name = 0 }
+};
+
+/* Path "net/core" under which both tables in this file are registered. */
+static __net_initdata struct ctl_path net_core_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "core", .ctl_name = NET_CORE, },
+ { },
+};
+
+/*
+ * Per-namespace setup of the net/core sysctls.
+ *
+ * Sets somaxconn to SOMAXCONN and registers netns_core_table for @net.
+ * init_net uses the static table directly; any other namespace gets a
+ * private copy whose first entry points at its own sysctl_somaxconn.
+ *
+ * Returns 0 on success, -ENOMEM if the copy or the registration fails.
+ */
+static __net_init int sysctl_core_net_init(struct net *net)
+{
+	struct ctl_table *tbl = netns_core_table;
+
+	net->core.sysctl_somaxconn = SOMAXCONN;
+
+	if (net != &init_net) {
+		tbl = kmemdup(netns_core_table, sizeof(netns_core_table),
+			      GFP_KERNEL);
+		if (tbl == NULL)
+			return -ENOMEM;
+
+		tbl[0].data = &net->core.sysctl_somaxconn;
+	}
+
+	net->core.sysctl_hdr = register_net_sysctl_table(net,
+			net_core_path, tbl);
+	if (net->core.sysctl_hdr != NULL)
+		return 0;
+
+	/* Registration failed: only a duplicated table may be freed. */
+	if (tbl != netns_core_table)
+		kfree(tbl);
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace teardown: unregister the sysctl table of @net and free the
+ * private copy made by sysctl_core_net_init().
+ */
+static __net_exit void sysctl_core_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ /* Fetch the table pointer before the header goes away. */
+ tbl = net->core.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->core.sysctl_hdr);
+ /* The static template belongs to init_net and must never be freed. */
+ BUG_ON(tbl == netns_core_table);
+ kfree(tbl);
+}
+
+/* Namespace hooks that set up and tear down the per-namespace sysctls. */
+static __net_initdata struct pernet_operations sysctl_core_ops = {
+ .init = sysctl_core_net_init,
+ .exit = sysctl_core_net_exit,
+};
+
+/*
+ * Boot-time setup: register the global net/core table, then the
+ * per-namespace hooks. Returns the result of register_pernet_subsys();
+ * the result of the table registration is not checked.
+ */
+static __init int sysctl_core_init(void)
+{
+	int err;
+
+	register_net_sysctl_rotable(net_core_path, net_core_table);
+
+	err = register_pernet_subsys(&sysctl_core_ops);
+	return err;
+}
+
+__initcall(sysctl_core_init);
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 0000000..164b090
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <net/tcp.h>
+#include <net/netdma.h>
+
+/* Initial value of sysctl_tcp_dma_copybreak. */
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+/* Exported tunable; this file only defines it and does not read it. */
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+EXPORT_SYMBOL(sysctl_tcp_dma_copybreak);
+
+/**
+ * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ * @chan: DMA channel handed to the dma_memcpy_*_to_iovec() helpers
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @pinned_list: locked iovec buffer data
+ *
+ * Note: the iovec is modified during the copy.
+ *
+ * Copies from the linear header, then the page fragments, then (by
+ * recursion) the skbs on the frag_list.
+ *
+ * Returns the cookie of the last copy issued, which is also stored in
+ * skb->dma_cookie, once @len bytes have been handed to the copy helpers.
+ * Returns -EFAULT if a helper fails or the skb holds fewer than @len bytes
+ * past @offset.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+ struct sk_buff *skb, int offset, struct iovec *to,
+ size_t len, struct dma_pinned_list *pinned_list)
+{
+ /* start/end bracket the piece being looked at, in skb data offsets. */
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ dma_cookie_t cookie = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+ skb->data + offset, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ copy = end - offset;
+ /* copy > 0 means @offset lies before the end of this fragment. */
+ if (copy > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+
+ cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+ frag->page_offset + offset - start, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ copy = end - offset;
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ /* Recurse with the offset made relative to this list skb. */
+ cookie = dma_skb_copy_datagram_iovec(chan, list,
+ offset - start, to, copy,
+ pinned_list);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+ start = end;
+ }
+ }
+
+end:
+ /* Reached with len != 0 when the skb ran out of data: falls into fault. */
+ if (!len) {
+ skb->dma_cookie = cookie;
+ return cookie;
+ }
+
+fault:
+ return -EFAULT;
+}
diff --git a/net/core/utils.c b/net/core/utils.c
new file mode 100644
index 0000000..72e0ebe
--- /dev/null
+++ b/net/core/utils.c
@@ -0,0 +1,299 @@
+/*
+ * Generic address resolution entity
+ *
+ * Authors:
+ * net_random Alan Cox
+ * net_ratelimit Andi Kleen
+ * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
+ *
+ * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/inet.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <net/sock.h>
+
+#include <asm/byteorder.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+int net_msg_warn __read_mostly = 1; /* initial value 1; exported to modules just below */
+EXPORT_SYMBOL(net_msg_warn);
+
+DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
+/*
+ * All net warning printk()s should be guarded by this function.
+ *
+ * Returns whatever __ratelimit() reports for the shared
+ * net_ratelimit_state defined above with the arguments (5 * HZ, 10) --
+ * presumably the interval in jiffies and the burst count; confirm
+ * against DEFINE_RATELIMIT_STATE.
+ */
+int net_ratelimit(void)
+{
+ return __ratelimit(&net_ratelimit_state);
+}
+EXPORT_SYMBOL(net_ratelimit);
+
+/*
+ * Turn a dotted-decimal ASCII string into a binary IPv4 address in
+ * network byte order.
+ * Kept outside net/ipv4/ because code that handles IP addresses is not
+ * otherwise dependent on the TCP/IP stack.
+ *
+ * No validation is performed: every character up to the next '.', '\n'
+ * or NUL is treated as a decimal digit, and octets missing from the end
+ * of the string are taken as zero.
+ */
+
+__be32 in_aton(const char *str)
+{
+	unsigned long addr = 0;
+	int octet;
+
+	for (octet = 0; octet < 4; octet++) {
+		unsigned int val = 0;
+
+		addr <<= 8;
+		if (*str == '\0')
+			continue;
+
+		for (; *str != '\0' && *str != '.' && *str != '\n'; str++)
+			val = val * 10 + (*str - '0');
+		addr |= val;
+
+		/* Step over the character that ended this octet. */
+		if (*str != '\0')
+			str++;
+	}
+	return htonl(addr);
+}
+
+EXPORT_SYMBOL(in_aton);
+
+#define IN6PTON_XDIGIT 0x00010000 /* hex digit; xdigit2bin() puts its value in the low bits */
+#define IN6PTON_DIGIT 0x00020000 /* decimal digit; set together with XDIGIT */
+#define IN6PTON_COLON_MASK 0x00700000 /* all three COLON_* bits; xdigit2bin() returns this for ':' */
+#define IN6PTON_COLON_1 0x00100000 /* single : requested */
+#define IN6PTON_COLON_2 0x00200000 /* second : requested */
+#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */
+#define IN6PTON_DOT 0x00800000 /* . */
+#define IN6PTON_DELIM 0x10000000 /* the delimiter character or end of input */
+#define IN6PTON_NULL 0x20000000 /* first/tail */
+#define IN6PTON_UNKNOWN 0x40000000 /* character matching none of the classes above */
+
+/*
+ * Classify one input character for in4_pton() and in6_pton().
+ *
+ * Returns IN6PTON_* flag bits; for a hex digit the digit's numeric value
+ * is OR-ed into the low bits.  The delimiter test comes first, so a delim
+ * of ':' or '.' takes precedence over the colon/dot classes.  When delim
+ * is -1, any character that is not ':', '.' or a hex digit is reported
+ * as a delimiter.
+ */
+static inline int xdigit2bin(char c, int delim)
+{
+	if (c == delim || c == '\0')
+		return IN6PTON_DELIM;
+
+	switch (c) {
+	case ':':
+		return IN6PTON_COLON_MASK;
+	case '.':
+		return IN6PTON_DOT;
+	default:
+		break;
+	}
+
+	if ('0' <= c && c <= '9')
+		return IN6PTON_XDIGIT | IN6PTON_DIGIT | (c - '0');
+	if ('a' <= c && c <= 'f')
+		return IN6PTON_XDIGIT | (c - 'a' + 10);
+	if ('A' <= c && c <= 'F')
+		return IN6PTON_XDIGIT | (c - 'A' + 10);
+
+	return delim == -1 ? IN6PTON_DELIM : IN6PTON_UNKNOWN;
+}
+
+/*
+ * Parse a dotted-quad IPv4 address from at most srclen bytes of src (the
+ * whole NUL-terminated string when srclen is negative) into the four
+ * bytes at dst.  Parsing ends at whatever xdigit2bin() reports as a
+ * delimiter or a colon.
+ *
+ * Returns 1 on success and 0 on failure; dst is written only on success.
+ * When end is non-NULL, *end receives the position where parsing stopped
+ * in either case.
+ */
+int in4_pton(const char *src, int srclen,
+	     u8 *dst,
+	     int delim, const char **end)
+{
+	const char *cur = src;
+	u8 octets[4];
+	u8 *wp = octets;
+	int nr_octets = 0;
+	int acc = 0;
+	int ret = 0;
+
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	for (;;) {
+		int c = xdigit2bin(srclen > 0 ? *cur : '\0', delim);
+
+		if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)))
+			goto out;
+
+		if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+			/*
+			 * Digits are added to acc with their flag bits still
+			 * attached, so acc == 0 means "no digit seen", not
+			 * "octet value 0".
+			 */
+			if (acc == 0)
+				goto out;
+			*wp++ = acc & 0xff;
+			acc = 0;
+			nr_octets++;
+			if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+				if (nr_octets != 4)
+					goto out;
+				break;
+			}
+		} else {
+			acc = (acc * 10) + c;
+			/* the low 16 bits hold the decimal value so far */
+			if ((acc & 0xffff) > 255)
+				goto out;
+		}
+
+		/* a fifth octet (or a '.' after the fourth) is an error */
+		if (nr_octets >= 4)
+			goto out;
+		cur++;
+		srclen--;
+	}
+	ret = 1;
+	memcpy(dst, octets, sizeof(octets));
+out:
+	if (end)
+		*end = cur;
+	return ret;
+}
+
+EXPORT_SYMBOL(in4_pton);
+
+int in6_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{ /*
+ * Parse a textual IPv6 address from at most srclen bytes of src (the
+ * whole NUL-terminated string when srclen is negative) into the 16
+ * bytes at dst.  Characters are classified with xdigit2bin(); a
+ * trailing dotted-quad part is handed to in4_pton().
+ *
+ * Returns 1 on success and 0 on failure; dst is written only on
+ * success.  When end is non-NULL, *end receives the position where
+ * parsing stopped in either case.
+ */
+ const char *s, *tok = NULL; /* s: read cursor; tok: char after the most recent ':' */
+ u8 *d, *dc = NULL; /* d: write cursor in dbuf; dc: where "::" was seen, if at all */
+ u8 dbuf[16];
+ int ret = 0;
+ int i;
+ int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; /* classes accepted for the next char */
+ int w = 0; /* value of the hex group being read */
+
+ memset(dbuf, 0, sizeof(dbuf));
+
+ s = src;
+ d = dbuf;
+ if (srclen < 0)
+ srclen = strlen(src);
+
+ while (1) {
+ int c;
+
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & state)) /* this class of character is not allowed here */
+ goto out;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ /* process one 16-bit word */
+ if (!(state & IN6PTON_NULL)) { /* nothing is stored while IN6PTON_NULL is set */
+ *d++ = (w >> 8) & 0xff;
+ *d++ = w & 0xff;
+ }
+ w = 0;
+ if (c & IN6PTON_DELIM) {
+ /* We've processed last word */
+ break;
+ }
+ /*
+ * COLON_1 => XDIGIT
+ * COLON_2 => XDIGIT|DELIM
+ * COLON_1_2 => COLON_2
+ */
+ switch (state & IN6PTON_COLON_MASK) {
+ case IN6PTON_COLON_2:
+ dc = d; /* remember where the "::" gap sits in dbuf */
+ state = IN6PTON_XDIGIT | IN6PTON_DELIM;
+ if (dc - dbuf >= sizeof(dbuf))
+ state |= IN6PTON_NULL;
+ break;
+ case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
+ state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
+ break;
+ case IN6PTON_COLON_1:
+ state = IN6PTON_XDIGIT;
+ break;
+ case IN6PTON_COLON_1_2:
+ state = IN6PTON_COLON_2;
+ break;
+ default:
+ state = 0;
+ }
+ tok = s + 1;
+ goto cont;
+ }
+
+ if (c & IN6PTON_DOT) { /*
+ * Re-read the current group, starting at tok, as a
+ * dotted quad; it supplies the next four bytes of dbuf.
+ */
+ ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
+ if (ret > 0) {
+ d += 4;
+ break;
+ }
+ goto out;
+ }
+
+ w = (w << 4) | (0xff & c);
+ state = IN6PTON_COLON_1 | IN6PTON_DELIM;
+ if (!(w & 0xf000)) { /* top nibble still clear: one more hex digit fits in 16 bits */
+ state |= IN6PTON_XDIGIT;
+ }
+ if (!dc && d + 2 < dbuf + sizeof(dbuf)) { /* no "::" yet and more groups are needed: cannot end here */
+ state |= IN6PTON_COLON_1_2;
+ state &= ~IN6PTON_DELIM;
+ }
+ if (d + 2 >= dbuf + sizeof(dbuf)) { /* this group fills dbuf: no colon may follow */
+ state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
+ }
+cont:
+ if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
+ d + 4 == dbuf + sizeof(dbuf)) { /* '.' allowed: exactly 12 bytes stored, or fewer after a "::" */
+ state |= IN6PTON_DOT;
+ }
+ if (d >= dbuf + sizeof(dbuf)) { /* dbuf is full: no further digits or colons */
+ state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
+ }
+ s++;
+ srclen--;
+ }
+
+ i = 15; d--; /* i: last index of dst; d: last byte written to dbuf */
+
+ if (dc) { /*
+ * A "::" was seen: move the bytes stored after it to the
+ * tail of dst, zero-fill down to the gap position, then
+ * copy the bytes stored before it.
+ */
+ while(d >= dc)
+ dst[i--] = *d--;
+ while(i >= dc - dbuf)
+ dst[i--] = 0;
+ while(i >= 0)
+ dst[i--] = *d--;
+ } else
+ memcpy(dst, dbuf, sizeof(dbuf));
+
+ ret = 1;
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+
+EXPORT_SYMBOL(in6_pton);
+
+/*
+ * Adjust the checksum at *sum for a 32-bit field that changed from
+ * 'from' to 'to', by summing { ~from, to } into the existing value.
+ *
+ * With CHECKSUM_PARTIAL, *sum is touched only when pseudohdr is set.
+ * Otherwise *sum is always updated, and for CHECKSUM_COMPLETE with
+ * pseudohdr set skb->csum is adjusted as well.
+ */
+void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+			      __be32 from, __be32 to, int pseudohdr)
+{
+	__be32 diff[] = { ~from, to };
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		if (pseudohdr)
+			*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+						       csum_unfold(*sum)));
+		return;
+	}
+
+	*sum = csum_fold(csum_partial(diff, sizeof(diff),
+				      ~csum_unfold(*sum)));
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+		skb->csum = ~csum_partial(diff, sizeof(diff),
+					  ~skb->csum);
+}
+EXPORT_SYMBOL(inet_proto_csum_replace4);
OpenPOWER on IntegriCloud