From db21733488f84a596faaad0d05430b3f51804692 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Sat, 17 Jun 2006 21:24:58 -0700 Subject: [I/OAT]: Setup the networking subsystem as a DMA client Attempts to allocate per-CPU DMA channels Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/core/dev.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4fba549..6bfa78c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,6 +115,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtnl * semaphore. @@ -1846,6 +1853,19 @@ static void net_rx_action(struct softirq_action *h) } } out: +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3300,6 +3320,88 @@ static int dev_cpu_callback(struct notifier_block *nfb, } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n; + struct dma_chan *chan; + + lock_cpu_hotplug(); + + if (net_dma_count == 0) { + for_each_online_cpu(cpu) + rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL); + unlock_cpu_hotplug(); + return; + } + + i = 0; + cpu = first_cpu(cpu_online_map); + + rcu_read_lock(); + list_for_each_entry(chan, &net_dma_client->channels, client_node) { + n = ((num_online_cpus() / net_dma_count) + + (i < (num_online_cpus() % net_dma_count) ? 1 : 0)); + + while(n) { + per_cpu(softnet_data.net_dma, cpu) = chan; + cpu = next_cpu(cpu, cpu_online_map); + n--; + } + i++; + } + rcu_read_unlock(); + + unlock_cpu_hotplug(); +} + +/** + * netdev_dma_event - event callback for the net_dma_client + * @client: should always be net_dma_client + * @chan: + * @event: + */ +static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, + enum dma_event event) +{ + spin_lock(&net_dma_event_lock); + switch (event) { + case DMA_RESOURCE_ADDED: + net_dma_count++; + net_dma_rebalance(); + break; + case DMA_RESOURCE_REMOVED: + net_dma_count--; + net_dma_rebalance(); + break; + default: + break; + } + spin_unlock(&net_dma_event_lock); +} + +/** + * netdev_dma_regiser - register the networking subsystem as a DMA client + */ +static int __init netdev_dma_register(void) +{ + spin_lock_init(&net_dma_event_lock); + net_dma_client = dma_async_client_register(netdev_dma_event); + if (net_dma_client == NULL) + return -ENOMEM; + + dma_async_client_chan_request(net_dma_client, num_online_cpus()); + return 0; +} + +#else +static int __init netdev_dma_register(void) { return -ENODEV; } +#endif /* CONFIG_NET_DMA */ /* * Initialize the DEV module. At boot time this walks the device list and @@ -3353,6 +3455,8 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } + netdev_dma_register(); + dev_boot_phase = 0; open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); -- cgit v1.1 From de5506e155276d385712c2aa1c2d9a27cd4ed947 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 17:50:37 -0700 Subject: [I/OAT]: Utility functions for offloading sk_buff to iovec copies Provides for pinning user space pages in memory, copying to iovecs, and copying from sk_buffs including fragmented and chained sk_buffs. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/core/Makefile | 1 + net/core/user_dma.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 net/core/user_dma.c (limited to 'net') diff --git a/net/core/Makefile b/net/core/Makefile index 79fe12c..e9bd246 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_WIRELESS_EXT) += wireless.o obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_NET_DMA) += user_dma.o diff --git a/net/core/user_dma.c b/net/core/user_dma.c new file mode 100644 index 0000000..9eee91b --- /dev/null +++ b/net/core/user_dma.c @@ -0,0 +1,127 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. + */ + +#include +#include +#include /* for BUG_TRAP */ +#include + +/** + * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * @pinned_list - locked iovec buffer data + * + * Note: the iovec is modified during the copy. + */ +int dma_skb_copy_datagram_iovec(struct dma_chan *chan, + struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + dma_cookie_t cookie = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_memcpy_to_iovec(chan, to, pinned_list, + skb->data + offset, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + copy = end - offset; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + + cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, + frag->page_offset + offset - start, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + copy = end - offset; + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_skb_copy_datagram_iovec(chan, list, + offset - start, to, copy, + pinned_list); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + } + +end: + if (!len) { + skb->dma_cookie = cookie; + return cookie; + } + +fault: + return -EFAULT; +} -- cgit v1.1 From 97fc2f0848c928c63c2ae619deee61a0b1107b69 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 17:55:33 -0700 Subject: [I/OAT]: Structure changes for TCP recv offload to I/OAT Adds an async_wait_queue and some additional fields to tcp_sock, and a dma_cookie_t to sk_buff. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index ed2afdb..5d820c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif rwlock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&sk->sk_async_wait_queue); +#endif sk->sk_send_head = NULL; -- cgit v1.1 From 0e4b4992b8007c6b62ec143cbbb292f98813ca11 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:00:16 -0700 Subject: [I/OAT]: Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b80..1c0cfd7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { /* Install new reader */ @@ -1391,7 +1391,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. */ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } -- cgit v1.1 From 624d1164730d58a494cc5aa4afa37d02c41e83a7 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:01:28 -0700 Subject: [I/OAT]: Make sk_eat_skb I/OAT aware. Add an extra argument to sk_eat_skb, and make it move early copied packets to the async_wait_queue instead of freeing them. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/dccp/proto.c | 4 ++-- net/ipv4/tcp.c | 8 ++++---- net/llc/af_llc.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e0ee83..5317fd3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1c0cfd7..4e067d25 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } if (skb->h.th->fin) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc->count) break; } @@ -1356,14 +1356,14 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (len > 0); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db7..7465170 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, continue; if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len > 0); -- cgit v1.1 From 9593782585e0cf70babe787a8463d492a68b1744 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:02:55 -0700 Subject: [I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold Any socket recv of less than this ammount will not be offloaded Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/core/user_dma.c | 4 ++++ net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 9eee91b..b7c98db 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -30,6 +30,10 @@ #include /* for BUG_TRAP */ #include +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + /** * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. * @skb - buffer to copy diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3ad..6a6aa53 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = "tcp_dma_copybreak", + .data = &sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif { .ctl_name = 0 } }; -- cgit v1.1 From 1a2449a87bb7606113b1aa1a9d3c3e78ef189a1c Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:05:53 -0700 Subject: [I/OAT]: TCP recv offload to I/OAT Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 103 +++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_input.c | 74 ++++++++++++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 18 ++++++++- net/ipv6/tcp_ipv6.c | 12 +++++- 4 files changed, 185 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e067d25..ff6ccda 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -263,7 +263,7 @@ #include #include #include - +#include #include #include @@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + preempt_disable(); + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); + } else + preempt_enable_no_resched(); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1329,13 +1345,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1355,15 +1397,19 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } break; } while (len > 0); @@ -1386,6 +1432,36 @@ skip_copy: tp->ucopy.len = 0; } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) { + struct sk_buff *skb; + dma_cookie_t done, used; + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + while (dma_async_memcpy_complete(tp->ucopy.dma_chan, + tp->ucopy.dma_cookie, &done, + &used) == DMA_IN_PROGRESS) { + /* do partial cleanup of sk_async_wait_queue */ + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + dma_chan_put(tp->ucopy.dma_chan); + tp->ucopy.dma_chan = NULL; + } + if (tp->ucopy.pinned_list) { + dma_unpin_iovec_pages(tp->ucopy.pinned_list); + tp->ucopy.pinned_list = NULL; + } +#endif + /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -1658,6 +1734,9 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); sk_stream_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif inet->dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b5521a9..c6d62f0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -71,6 +71,7 @@ #include #include #include +#include int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; @@ -3785,6 +3786,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk __tcp_checksum_complete_user(sk, skb); } +#ifdef CONFIG_NET_DMA +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int dma_cookie; + int copied_early = 0; + + if (tp->ucopy.wakeup) + return 0; + + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + + dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, + skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + + if (dma_cookie < 0) + goto out; + + tp->ucopy.dma_cookie = dma_cookie; + copied_early = 1; + + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + + if ((tp->ucopy.len == 0) || + (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } + } else if (chunk > 0) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } +out: + return copied_early; +} +#endif /* CONFIG_NET_DMA */ + /* * TCP receive function for the ESTABLISHED state. * @@ -3901,14 +3946,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; + int copied_early = 0; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { +#ifdef CONFIG_NET_DMA + if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +#endif + if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) + eaten = 1; + } + if (eaten) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -3924,8 +3978,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; } + if (copied_early) + tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -3966,6 +4021,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __tcp_ack_snd_check(sk, 0); no_ack: +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + else +#endif if (eaten) __kfree_skb(skb); else diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 672950e..25ecc6e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -1091,8 +1092,18 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + /* Cleans up our sk_async_wait_queue */ + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 301eee7..a50eb30 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1218,8 +1218,16 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (tp->ucopy.dma_chan) + ret = tcp_v6_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); -- cgit v1.1 From aecbd4e45c2e469e0452ffb2c0b0d881e2815bb8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:08:30 -0700 Subject: [LLC]: use more efficient ether address routines Use more cache efficient Ethernet address manipulation functions in etherdevice.h. Signed-off-by: Stephen Hemminger --- net/llc/llc_if.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index ba90f7f..5ae47be 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -26,8 +26,6 @@ #include #include -u8 llc_mac_null_var[IFHWADDRLEN]; - /** * llc_build_and_send_pkt - Connection data sending for upper layers. * @sk: connection -- cgit v1.1 From 29efcd2666b3a465b40aa07ef1f4d79847303e2f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:08:59 -0700 Subject: [LLC]: allow datagram recvmsg LLC receive is broken for SOCK_DGRAM. If an application does recv() on a datagram socket and there is no data present, don't return "not connected". Instead, just do normal datagram semantics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/af_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 7465170..75c9b14 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -674,7 +674,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, lock_sock(sk); copied = -ENOTCONN; - if (sk->sk_state == TCP_LISTEN) + if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); @@ -733,7 +733,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_shutdown & RCV_SHUTDOWN) break; - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read -- cgit v1.1 From 23dbe7912dad6be71bb9e69cb819d05e2442d362 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:09:37 -0700 Subject: [LLC]: use rcu_dereference on receive handler The receive hander pointer might be modified during network changes of protocol. So use rcu_dereference (only matters on alpha). Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/llc_input.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index d62e0f9..cb9f7f0 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -142,6 +142,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; + int (*rcv)(struct sk_buff *, struct net_device *, + struct packet_type *, struct net_device *); /* * When the interface is in promisc. mode, drop all the crap that it @@ -169,8 +171,9 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, * First the upper layer protocols that don't need the full * LLC functionality */ - if (sap->rcv_func) { - sap->rcv_func(skb, dev, pt, orig_dev); + rcv = rcu_dereference(sap->rcv_func); + if (rcv) { + rcv(skb, dev, pt, orig_dev); goto out_put; } dest = llc_pdu_type(skb); -- cgit v1.1 From 8f182b494f87799d6ae20a1401825c516da46081 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:10:02 -0700 Subject: [LLC]: allow applications to get copy of kernel datagrams It is legal for an application to bind to a SAP that is also being used by the kernel. This happens if the bridge module binds to the STP SAP, and the user wants to have a daemon for STP as well. It is possible to have kernel doing STP on one bridge, but let application do RSTP on another bridge. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/llc_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index cb9f7f0..32cf8ac 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -173,8 +173,10 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); if (rcv) { + struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); + if (cskb) + rcv(cskb, dev, pt, orig_dev); rcv(skb, dev, pt, orig_dev); - goto out_put; } dest = llc_pdu_type(skb); if (unlikely(!dest || !llc_type_handlers[dest - 1])) -- cgit v1.1 From bc0e646796928918e45b6465e02616f2fe65c3c1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:10:37 -0700 Subject: [LLC]: add multicast support for datagrams Allow mulitcast reception of datagrams (similar to UDP). All sockets bound to the same SAP receive a clone. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/llc_sap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 4029cee..20c4eb5 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb) * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, - struct llc_addr *laddr) + const struct llc_addr *laddr) { struct sock *rc; struct hlist_node *node; @@ -304,19 +304,62 @@ found: return rc; } +/** + * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search socket list of the SAP and finds connections with same sap. + * Deliver clone to each. + */ +static void llc_sap_mcast(struct llc_sap *sap, + const struct llc_addr *laddr, + struct sk_buff *skb) +{ + struct sock *sk; + struct hlist_node *node; + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(sk, node, &sap->sk_list.list) { + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *skb1; + + if (sk->sk_type != SOCK_DGRAM) + continue; + + if (llc->laddr.lsap != laddr->lsap) + continue; + + skb1 = skb_clone(skb, GFP_ATOMIC); + if (!skb1) + break; + + sock_hold(sk); + skb_set_owner_r(skb1, sk); + llc_sap_rcv(sap, skb1); + sock_put(sk); + } + read_unlock_bh(&sap->sk_list.lock); +} + + void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; - struct sock *sk; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); - sk = llc_lookup_dgram(sap, &laddr); - if (sk) { - skb_set_owner_r(skb, sk); - llc_sap_rcv(sap, skb); - sock_put(sk); - } else + if (llc_mac_multicast(laddr.mac)) { + llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); + } else { + struct sock *sk = llc_lookup_dgram(sap, &laddr); + if (sk) { + skb_set_owner_r(skb, sk); + llc_sap_rcv(sap, skb); + sock_put(sk); + } else + kfree_skb(skb); + } } -- cgit v1.1 From 9ef513bed68534110727381ab652f06756803f5a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:58:54 -0700 Subject: [BRIDGE]: optimize conditional in forward path Small optimizations of bridge forwarding path. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 56f3aa47..0dca027 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -20,14 +20,11 @@ #include #include "br_private.h" +/* Don't forward packets to originating port or forwarding diasabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { - if (skb->dev == p->dev || - p->state != BR_STATE_FORWARDING) - return 0; - - return 1; + return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING); } static inline unsigned packet_length(const struct sk_buff *skb) @@ -55,10 +52,9 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) int br_forward_finish(struct sk_buff *skb) { - NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, - br_dev_queue_push_xmit); + return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, + br_dev_queue_push_xmit); - return 0; } static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) -- cgit v1.1 From c090971326db094ed702c1f8f2dbe04b7e3b8f27 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:59:33 -0700 Subject: [BRIDGE]: fix module startup error handling Return address in use, if some other kernel code has the SAP. Propogate out error codes from netfilter registration and unwind. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br.c | 26 +++++++++++++++++--------- net/bridge/br_private.h | 5 +++++ 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 12da21a..558d2720 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -30,36 +30,44 @@ static struct llc_sap *br_stp_sap; static int __init br_init(void) { + int err; + br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv); if (!br_stp_sap) { printk(KERN_ERR "bridge: can't register sap for STP\n"); - return -EBUSY; + return -EADDRINUSE; } br_fdb_init(); -#ifdef CONFIG_BRIDGE_NETFILTER - if (br_netfilter_init()) - return 1; -#endif + err = br_netfilter_init(); + if (err) + goto err_out1; + + err = register_netdevice_notifier(&br_device_notifier); + if (err) + goto err_out2; + brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; br_fdb_get_hook = br_fdb_get; br_fdb_put_hook = br_fdb_put; - register_netdevice_notifier(&br_device_notifier); - return 0; + +err_out2: + br_netfilter_fini(); +err_out1: + llc_sap_put(br_stp_sap); + return err; } static void __exit br_deinit(void) { rcu_assign_pointer(br_stp_sap->rcv_func, NULL); -#ifdef CONFIG_BRIDGE_NETFILTER br_netfilter_fini(); -#endif unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ecea7..22071d1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -192,8 +192,13 @@ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); /* br_netfilter.c */ +#ifdef CONFIG_BRIDGE_NETFILTER extern int br_netfilter_init(void); extern void br_netfilter_fini(void); +#else +#define br_netfilter_init() (0) +#define br_netfilter_fini() do { } while(0) +#endif /* br_stp.c */ extern void br_log_state(const struct net_bridge_port *p); -- cgit v1.1 From 11dc1f36a6701b502ecb695f308aae46ede8bac6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 16:00:12 -0700 Subject: [BRIDGE]: netlink interface for link management Add basic netlink support to the Ethernet bridge. Including: * dump interfaces in bridges * monitor link status changes * change state of bridge port For some demo programs see: http://developer.osdl.org/shemminger/prototypes/brnl.tar.gz These are to allow building a daemon that does alternative implementations of Spanning Tree Protocol. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br.c | 2 + net/bridge/br_netlink.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_notify.c | 2 + net/bridge/br_private.h | 7 +- net/bridge/br_stp_if.c | 4 + 6 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 net/bridge/br_netlink.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 59556e4..f444c12 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o + br_stp_if.o br_stp_timer.o br_netlink.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 558d2720..654401c 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -48,6 +48,7 @@ static int __init br_init(void) if (err) goto err_out2; + br_netlink_init(); brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; @@ -67,6 +68,7 @@ static void __exit br_deinit(void) { rcu_assign_pointer(br_stp_sap->rcv_func, NULL); + br_netlink_fini(); br_netfilter_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c new file mode 100644 index 0000000..881d7d1 --- /dev/null +++ b/net/bridge/br_netlink.c @@ -0,0 +1,199 @@ +/* + * Bridge netlink control interface + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "br_private.h" + +/* + * Create one netlink message for one interface + * Contains port and master info as well as carrier and bridge state. + */ +static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, + u32 pid, u32 seq, int event, unsigned int flags) +{ + const struct net_bridge *br = port->br; + const struct net_device *dev = port->dev; + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + u32 mtu = dev->mtu; + u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + u8 portstate = port->state; + + pr_debug("br_fill_info event %d port %s master %s\n", + event, dev->name, br->dev->name); + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); + r = NLMSG_DATA(nlh); + r->ifi_family = AF_BRIDGE; + r->__ifi_pad = 0; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = 0; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + + RTA_PUT(skb, IFLA_MASTER, sizeof(int), &br->dev->ifindex); + + if (dev->addr_len) + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + + + RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate); + + if (event == RTM_NEWLINK) + RTA_PUT(skb, IFLA_PROTINFO, sizeof(portstate), &portstate); + + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: +rtattr_failure: + + skb_trim(skb, b - skb->data); + return -EINVAL; +} + +/* + * Notify listeners of a change in port information + */ +void br_ifinfo_notify(int event, struct net_bridge_port *port) +{ + struct sk_buff *skb; + int err = -ENOMEM; + + pr_debug("bridge notify event=%d\n", event); + skb = alloc_skb(NLMSG_SPACE(sizeof(struct ifinfomsg) + 128), + GFP_ATOMIC); + if (!skb) + goto err_out; + + err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0); + if (err) + goto err_kfree; + + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); + return; + +err_kfree: + kfree_skb(skb); +err_out: + netlink_set_err(rtnl, 0, RTNLGRP_LINK, err); +} + +/* + * Dump information about all ports, in response to GETLINK + */ +static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net_device *dev; + int idx; + int s_idx = cb->args[0]; + int err = 0; + + read_lock(&dev_base_lock); + for (dev = dev_base, idx = 0; dev; dev = dev->next) { + struct net_bridge_port *p = dev->br_port; + + /* not a bridge port */ + if (!p) + continue; + + if (idx < s_idx) + continue; + + err = br_fill_ifinfo(skb, p, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI); + if (err <= 0) + break; + ++idx; + } + read_unlock(&dev_base_lock); + + cb->args[0] = idx; + + return skb->len; +} + +/* + * Change state of port (ie from forwarding to blocking etc) + * Used by spanning tree in user space. + */ +static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifinfomsg *ifm = NLMSG_DATA(nlh); + struct net_device *dev; + struct net_bridge_port *p; + u8 new_state; + + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + /* Must pass valid state as PROTINFO */ + if (rta[IFLA_PROTINFO-1]) { + u8 *pstate = RTA_DATA(rta[IFLA_PROTINFO-1]); + new_state = *pstate; + } else + return -EINVAL; + + if (new_state > BR_STATE_BLOCKING) + return -EINVAL; + + /* Find bridge port */ + dev = __dev_get_by_index(ifm->ifi_index); + if (!dev) + return -ENODEV; + + p = dev->br_port; + if (!p) + return -EINVAL; + + /* if kernel STP is running, don't allow changes */ + if (p->br->stp_enabled) + return -EBUSY; + + if (!netif_running(dev)) + return -ENETDOWN; + + if (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED) + return -ENETDOWN; + + p->state = new_state; + br_log_state(p); + return 0; +} + + +static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = { + [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, }, + [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, }, +}; + +void __init br_netlink_init(void) +{ + rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table; +} + +void __exit br_netlink_fini(void) +{ + rtnetlink_links[PF_BRIDGE] = NULL; +} + diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index a43a9c1..2027849 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -14,6 +14,7 @@ */ #include +#include #include "br_private.h" @@ -49,6 +50,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v case NETDEV_CHANGEADDR: br_fdb_changeaddr(p, dev->dev_addr); + br_ifinfo_notify(RTM_NEWLINK, p); br_stp_recalculate_bridge_id(br); break; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 22071d1..c491fb2 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -29,7 +29,7 @@ #define BR_PORT_DEBOUNCE (HZ/10) -#define BR_VERSION "2.1" +#define BR_VERSION "2.2" typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; @@ -237,6 +237,11 @@ extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); +/* br_netlink.c */ +extern void br_netlink_init(void); +extern void br_netlink_fini(void); +extern void br_ifinfo_notify(int event, struct net_bridge_port *port); + #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ extern struct sysfs_ops brport_sysfs_ops; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 23dea14..14cd025 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "br_private.h" #include "br_private_stp.h" @@ -86,6 +87,7 @@ void br_stp_disable_bridge(struct net_bridge *br) void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); + br_ifinfo_notify(RTM_NEWLINK, p); br_port_state_selection(p->br); } @@ -99,6 +101,8 @@ void br_stp_disable_port(struct net_bridge_port *p) printk(KERN_INFO "%s: port %i(%s) entering %s state\n", br->dev->name, p->port_no, p->dev->name, "disabled"); + br_ifinfo_notify(RTM_DELLINK, p); + wasroot = br_is_root_bridge(br); br_become_designated_port(p); p->state = BR_STATE_DISABLED; -- cgit v1.1 From 15986e1aadbbf40a331cddd0470bb434d156431d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 25 May 2006 16:11:14 -0700 Subject: [TCP]: tcp_rcv_rtt_measure_ts() call in pure-ACK path is superfluous We only want to take receive RTT mesaurements for data bearing frames, here in the header prediction fast path for a pure-sender, we know that we have a pure-ACK and thus the checks in tcp_rcv_rtt_mesaure_ts() will not pass. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c6d62f0..6d16788 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3931,8 +3931,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(sk, skb); - /* We know that such packets are checksummed * on entry. */ -- cgit v1.1 From 546be2405be119ef55467aace45f337a16e5d424 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:03:58 -0700 Subject: [IPSEC] xfrm: Undo afinfo lock proliferation The number of locks used to manage afinfo structures can easily be reduced down to one each for policy and state respectively. This is based on the observation that the write locks are only held by module insertion/removal which are very rare events so there is no need to further differentiate between the insertion of modules like ipv6 versus esp6. The removal of the read locks in xfrm4_policy.c/xfrm6_policy.c might look suspicious at first. However, after you realise that nobody ever takes the corresponding write lock you'll feel better :) As far as I can gather it's an attempt to guard against the removal of the corresponding modules. Since neither module can be unloaded at all we can leave it to whoever fixes up IPv6 unloading :) Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/xfrm4_policy.c | 6 ----- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_policy.c | 6 ----- net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_policy.c | 58 +++++++++++++++++++++++++++++-------------------- net/xfrm/xfrm_state.c | 9 +++----- 6 files changed, 37 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8604c74..c046528 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,8 +17,6 @@ static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { return __ip_route_output_key((struct rtable**)dst, fl); @@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) static inline int xfrm4_garbage_collect(void) { - read_lock(&xfrm4_policy_afinfo.lock); xfrm4_policy_afinfo.garbage_collect(); - read_unlock(&xfrm4_policy_afinfo.lock); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); } @@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm4_type_map, .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .find_bundle = __xfrm4_find_bundle, diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index dbabf81..81e1751 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .state_lookup = __xfrm4_state_lookup, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 88c840f..ee715f2 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -23,8 +23,6 @@ static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { int err = 0; @@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) static inline int xfrm6_garbage_collect(void) { - read_lock(&xfrm6_policy_afinfo.lock); xfrm6_policy_afinfo.garbage_collect(); - read_unlock(&xfrm6_policy_afinfo.lock); return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); } @@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = { static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm6_type_map, .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .find_bundle = __xfrm6_find_bundle, diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index a572302..b33296b 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, .init_tempsel = __xfrm6_init_tempsel, .state_lookup = __xfrm6_state_lookup, .find_acq = __xfrm6_find_acq, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b469c8b..44b64a5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family); +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo); int xfrm_register_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (likely(typemap->map[type->proto] == NULL)) - typemap->map[type->proto] = type; + if (likely(typemap[type->proto] == NULL)) + typemap[type->proto] = type; else err = -EEXIST; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_register_type); int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (unlikely(typemap->map[type->proto] != type)) + if (unlikely(typemap[type->proto] != type)) err = -ENOENT; else - typemap->map[type->proto] = NULL; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + typemap[type->proto] = NULL; + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type); struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { struct xfrm_policy_afinfo *afinfo; - struct xfrm_type_map *typemap; + struct xfrm_type **typemap; struct xfrm_type *type; int modload_attempted = 0; @@ -102,11 +100,9 @@ retry: return NULL; typemap = afinfo->type_map; - read_lock(&typemap->lock); - type = typemap->map[proto]; + type = typemap[proto]; if (unlikely(type && !try_module_get(type->owner))) type = NULL; - read_unlock(&typemap->lock); if (!type && !modload_attempted) { xfrm_policy_put_afinfo(afinfo); request_module("xfrm-type-%d-%d", @@ -1306,17 +1302,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) return NULL; read_lock(&xfrm_policy_afinfo_lock); afinfo = xfrm_policy_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_policy_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_policy_afinfo_lock); return afinfo; } static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_policy_afinfo_lock); +} + +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + write_lock_bh(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (unlikely(!afinfo)) + write_unlock_bh(&xfrm_policy_afinfo_lock); + return afinfo; +} + +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + write_unlock_bh(&xfrm_policy_afinfo_lock); } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 93a2f36..ee62c23 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1103,17 +1103,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family) return NULL; read_lock(&xfrm_state_afinfo_lock); afinfo = xfrm_state_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_state_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_state_afinfo_lock); return afinfo; } static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_state_afinfo_lock); } /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -- cgit v1.1 From b59f45d0b2878ab76f8053b0973654e6621828ee Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:05:54 -0700 Subject: [IPSEC] xfrm: Abstract out encapsulation modes This patch adds the structure xfrm_mode. It is meant to represent the operations carried out by transport/tunnel modes. By doing this we allow additional encapsulation modes to be added without clogging up the xfrm_input/xfrm_output paths. Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and BEET modes. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 18 ++++++ net/ipv4/Makefile | 2 + net/ipv4/xfrm4_input.c | 28 +-------- net/ipv4/xfrm4_mode_transport.c | 69 ++++++++++++++++++++++ net/ipv4/xfrm4_mode_tunnel.c | 125 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/xfrm4_output.c | 61 +------------------- net/ipv6/Kconfig | 20 +++++++ net/ipv6/Makefile | 2 + net/ipv6/ip6_output.c | 2 + net/ipv6/xfrm6_input.c | 29 +--------- net/ipv6/xfrm6_mode_transport.c | 73 +++++++++++++++++++++++ net/ipv6/xfrm6_mode_tunnel.c | 121 ++++++++++++++++++++++++++++++++++++++ net/ipv6/xfrm6_output.c | 63 +------------------- net/xfrm/xfrm_policy.c | 83 ++++++++++++++++++++++++++ net/xfrm/xfrm_state.c | 6 ++ 15 files changed, 532 insertions(+), 170 deletions(-) create mode 100644 net/ipv4/xfrm4_mode_transport.c create mode 100644 net/ipv4/xfrm4_mode_tunnel.c create mode 100644 net/ipv6/xfrm6_mode_transport.c create mode 100644 net/ipv6/xfrm6_mode_tunnel.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e40f753..35eb70b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -414,6 +414,24 @@ config INET_TUNNEL tristate default n +config INET_XFRM_MODE_TRANSPORT + tristate "IP: IPsec transport mode" + default y + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET_XFRM_MODE_TUNNEL + tristate "IP: IPsec tunnel mode" + default y + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9ef50a0..4cc9448 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o +obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o +obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 3e174c8..817ed84 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) EXPORT_SYMBOL(xfrm4_rcv); -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *outer_iph = skb->nh.iph; - struct iphdr *inner_iph = skb->h.ipiph; - - if (INET_ECN_is_ce(outer_iph->tos)) - IP_ECN_set_ce(inner_iph); -} - static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) { switch (nexthdr) { @@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) xfrm_vec[xfrm_nr++] = x; - iph = skb->nh.iph; + if (x->mode->input(x, skb)) + goto drop; if (x->props.mode) { - if (iph->protocol != IPPROTO_IPIP) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); decaps = 1; break; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 0000000..e46d9a4 --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c @@ -0,0 +1,69 @@ +/* + * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header will be moved forward to make space for the encapsulation + * header. + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x; + struct iphdr *iph; + int ihl; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + ihl = iph->ihl * 4; + skb->h.raw += ihl; + + x = skb->dst->xfrm; + skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + return 0; +} + +static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_mode xfrm4_transport_mode = { + .input = xfrm4_transport_input, + .output = xfrm4_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm4_transport_init(void) +{ + return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); +} + +static void __exit xfrm4_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_transport_init); +module_exit(xfrm4_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 0000000..f8d880b --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -0,0 +1,125 @@ +/* + * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + int flags; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + + flags = x->props.flags; + if (flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : (iph->frag_off & htons(IP_DF)); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst->child, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + return 0; +} + +static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int err = -EINVAL; + + if (iph->protocol != IPPROTO_IPIP) + goto out; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm4_tunnel_mode = { + .input = xfrm4_tunnel_input, + .output = xfrm4_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm4_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); +} + +static void __exit xfrm4_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_tunnel_init); +module_exit(xfrm4_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 4ef8efa..ac9d91d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -12,67 +12,10 @@ #include #include #include -#include #include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header will be moved forward to make space - * for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. - */ -static void xfrm4_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph, *top_iph; - int flags; - - iph = skb->nh.iph; - skb->h.ipiph = iph; - - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; - - if (!x->props.mode) { - skb->h.raw += iph->ihl*4; - memmove(top_iph, iph, iph->ihl*4); - return; - } - - top_iph->ihl = 5; - top_iph->version = 4; - - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (iph->frag_off & htons(IP_DF)); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); - - top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - top_iph->protocol = IPPROTO_IPIP; - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -} - static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb) if (err) goto error; - xfrm4_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index f8a107a..e923d4d 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -106,6 +106,26 @@ config INET6_TUNNEL tristate default n +config INET6_XFRM_MODE_TRANSPORT + tristate "IPv6: IPsec transport mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_TUNNEL + tristate "IPv6: IPsec tunnel mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" select INET6_TUNNEL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index a760b09..386e0a6 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o +obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o +obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e460489..416f6e4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } +EXPORT_SYMBOL_GPL(ip6_find_1stfragopt); static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00cfdee..0405d74 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -13,21 +13,9 @@ #include #include #include -#include -#include -#include #include #include -static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) -{ - struct ipv6hdr *outer_iph = skb->nh.ipv6h; - struct ipv6hdr *inner_iph = skb->h.ipv6h; - - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) - IP6_ECN_set_ce(inner_iph); -} - int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) { int err; @@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) xfrm_vec[xfrm_nr++] = x; + if (x->mode->input(x, skb)) + goto drop; + if (x->props.mode) { /* XXX */ - if (nexthdr != IPPROTO_IPV6) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; decaps = 1; break; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c new file mode 100644 index 0000000..5efbbae --- /dev/null +++ b/net/ipv6/xfrm6_mode_transport.c @@ -0,0 +1,73 @@ +/* + * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the encapsulation header. + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x = skb->dst->xfrm; + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + return 0; +} + +static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_mode xfrm6_transport_mode = { + .input = xfrm6_transport_input, + .output = xfrm6_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm6_transport_init(void) +{ + return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); +} + +static void __exit xfrm6_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_transport_init); +module_exit(xfrm6_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c new file mode 100644 index 0000000..8af79be2 --- /dev/null +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -0,0 +1,121 @@ +/* + * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph, *top_iph; + int dsfield; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + dsfield = ipv6_get_dsfield(top_iph); + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->nexthdr = IPPROTO_IPV6; + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm6_tunnel_mode = { + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm6_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); +} + +static void __exit xfrm6_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8024217..16e8425 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -14,68 +14,9 @@ #include #include #include -#include -#include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header and mutable extension headers will be moved - * forward to make space for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * payload_len - * - * On exit, skb->h will be set to the start of the encapsulation header to be - * filled in by x->type->output and skb->nh will be set to the nextheader field - * of the extension header directly preceding the encapsulation header, or in - * its absence, that of the top IP header. The value of skb->data will always - * point to the top IP header. - */ -static void xfrm6_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct ipv6hdr *iph, *top_iph; - int dsfield; - - skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; - - if (!x->props.mode) { - u8 *prevhdr; - int hdr_len; - - hdr_len = ip6_find_1stfragopt(skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; - memmove(skb->data, iph, hdr_len); - return; - } - - skb->nh.raw = skb->data; - top_iph = skb->nh.ipv6h; - skb->nh.raw = &top_iph->nexthdr; - skb->h.ipv6h = top_iph + 1; - - top_iph->version = 6; - top_iph->priority = iph->priority; - top_iph->flow_lbl[0] = iph->flow_lbl[0]; - top_iph->flow_lbl[1] = iph->flow_lbl[1]; - top_iph->flow_lbl[2] = iph->flow_lbl[2]; - dsfield = ipv6_get_dsfield(top_iph); - dsfield = INET_ECN_encapsulate(dsfield, dsfield); - if (x->props.flags & XFRM_STATE_NOECN) - dsfield &= ~INET_ECN_MASK; - ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->nexthdr = IPPROTO_IPV6; - top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); -} - static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb) if (err) goto error; - xfrm6_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 44b64a5..b893692 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -138,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type) module_put(type->owner); } +int xfrm_register_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -EEXIST; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == NULL)) { + modemap[mode->encap] = mode; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_mode); + +int xfrm_unregister_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -ENOENT; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == mode)) { + modemap[mode->encap] = NULL; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_mode); + +struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode *mode; + int modload_attempted = 0; + + if (unlikely(encap >= XFRM_MODE_MAX)) + return NULL; + +retry: + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + + mode = afinfo->mode_map[encap]; + if (unlikely(mode && !try_module_get(mode->owner))) + mode = NULL; + if (!mode && !modload_attempted) { + xfrm_policy_put_afinfo(afinfo); + request_module("xfrm-mode-%d-%d", family, encap); + modload_attempted = 1; + goto retry; + } + + xfrm_policy_put_afinfo(afinfo); + return mode; +} + +void xfrm_put_mode(struct xfrm_mode *mode) +{ + module_put(mode->owner); +} + static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ee62c23..17b29ec 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) kfree(x->ealg); kfree(x->calg); kfree(x->encap); + if (x->mode) + xfrm_put_mode(x->mode); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -1193,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x) if (err) goto error; + x->mode = xfrm_get_mode(x->props.mode, family); + if (x->mode == NULL) + goto error; + x->km.state = XFRM_STATE_VALID; error: -- cgit v1.1 From 31a4ab93025719e62e7cf7ce899f71c34ecde5a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:06:13 -0700 Subject: [IPSEC] proto: Move transport mode input path into xfrm_mode_transport Now that we have xfrm_mode objects we can move the transport mode specific input decapsulation code into xfrm_mode_transport. This removes duplicate code as well as unnecessary header movement in case of tunnel mode SAs since we will discard the original IP header immediately. This also fixes a minor bug for transport-mode ESP where the IP payload length is set to the correct value minus the header length (with extension headers for IPv6). Of course the other neat thing is that we no longer have to allocate temporary buffers to hold the IP headers for ESP and IPComp. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 15 +++++++-------- net/ipv4/esp4.c | 18 ++++++------------ net/ipv4/ipcomp.c | 23 +++++------------------ net/ipv4/xfrm4_mode_transport.c | 14 ++++++++++++++ net/ipv6/ah6.c | 10 +++------- net/ipv6/esp6.c | 20 ++++---------------- net/ipv6/ipcomp6.c | 27 +++++---------------------- net/ipv6/xfrm6_mode_transport.c | 15 +++++++++++++++ 8 files changed, 59 insertions(+), 83 deletions(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e2e4771..c778223 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -119,6 +119,7 @@ error: static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; + int ihl; struct iphdr *iph; struct ip_auth_hdr *ah; struct ah_data *ahp; @@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr*)skb->data; iph = skb->nh.iph; - memcpy(work_buf, iph, iph->ihl*4); + ihl = skb->data - skb->nh.raw; + memcpy(work_buf, iph, ihl); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; - if (iph->ihl != 5) { + if (ihl > sizeof(*iph)) { u32 dummy; if (ip_clear_mutable_options(iph, &dummy)) goto out; @@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) u8 auth_data[MAX_AH_AUTH_LEN]; memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, ihl); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { x->stats.integrity_failed++; @@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, work_buf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); - skb_pull(skb, skb->nh.iph->ihl*4); - skb->h.raw = skb->data; + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + __skb_pull(skb, ah_hlen + ihl); return 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9d1881c..9bbdd449 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) int alen = esp->auth.icv_trunc_len; int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; int nfrags; - int encap_len = 0; + int ihl; u8 nexthdr[2]; struct scatterlist *sg; - u8 workbuf[60]; int padlen; if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) @@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr*)skb->data; - iph = skb->nh.iph; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - - uh = (struct udphdr *)(iph + 1); - encap_len = (void*)esph - (void*)uh; + struct udphdr *uh = (void *)(skb->nh.raw + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - memcpy(workbuf, skb->nh.raw, iph->ihl*4); - skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, workbuf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; return 0; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 95278b2..8e24358 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { int err, plen, dlen; - struct iphdr *iph; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; struct crypto_tfm *tfm; @@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); - iph = skb->nh.iph; - iph->tot_len = htons(dlen + iph->ihl * 4); out: put_cpu(); return err; @@ -83,14 +80,9 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - u8 nexthdr; int err = 0; struct iphdr *iph; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; - + struct ip_comp_hdr *ipch; if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && skb_linearize(skb, GFP_ATOMIC) != 0) { @@ -102,15 +94,10 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) /* Remove ipcomp header and decompress original payload */ iph = skb->nh.iph; - memcpy(&tmp_iph, iph, iph->ihl * 4); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ip_comp_hdr)); - skb->nh.raw += sizeof(struct ip_comp_hdr); - memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); - iph = skb->nh.iph; - iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); - iph->protocol = nexthdr; - skb->h.raw = skb->data; + ipch = (void *)skb->data; + iph->protocol = ipch->nexthdr; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); out: diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index e46d9a4..a9e6b3d 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -38,8 +38,22 @@ static int xfrm4_transport_output(struct sk_buff *skb) return 0; } +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.iph->tot_len = htons(skb->len + ihl); + skb->h.raw = skb->data; return 0; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 6778173..d31c0d6 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, hdr_len); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); @@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) } } - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); - skb_pull(skb, hdr_len); - skb->h.raw = skb->data; - + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); kfree(tmp_hdr); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 22f0460..a15a6f3 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) int hdr_len = skb->h.raw - skb->nh.raw; int nfrags; - unsigned char *tmp_hdr = NULL; int ret = 0; if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { ret = -EINVAL; - goto out_nofree; + goto out; } if (elen <= 0 || (elen & (blksize-1))) { ret = -EINVAL; - goto out_nofree; - } - - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) { - ret = -ENOMEM; - goto out_nofree; + goto out; } - memcpy(tmp_hdr, skb->nh.raw, hdr_len); /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { @@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ pskb_trim(skb, skb->len - alen - padlen - 2); - skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ret = nexthdr[1]; } + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len; + out: - kfree(tmp_hdr); -out_nofree: return ret; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 4863643..cec3be5 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -66,10 +66,8 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; - u8 nexthdr = 0; - int hdr_len = skb->h.raw - skb->nh.raw; - unsigned char *tmp_hdr = NULL; struct ipv6hdr *iph; + struct ipv6_comp_hdr *ipch; int plen, dlen; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; @@ -86,17 +84,9 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) /* Remove ipcomp header and decompress original payload */ iph = skb->nh.ipv6h; - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) - goto out; - memcpy(tmp_hdr, iph, hdr_len); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ipv6_comp_hdr)); - skb->nh.raw += sizeof(struct ipv6_comp_hdr); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - iph = skb->nh.ipv6h; - iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); - skb->h.raw = skb->data; + ipch = (void *)skb->data; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); /* decompression */ plen = skb->len; @@ -125,18 +115,11 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); + err = ipch->nexthdr; - iph = skb->nh.ipv6h; - iph->payload_len = htons(skb->len); - out_put_cpu: put_cpu(); out: - kfree(tmp_hdr); - if (err) - goto error_out; - return nexthdr; -error_out: return err; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 5efbbae..711d713 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -42,8 +42,23 @@ static int xfrm6_transport_output(struct sk_buff *skb) return 0; } +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.ipv6h->payload_len = htons(skb->len + ihl - + sizeof(struct ipv6hdr)); + skb->h.raw = skb->data; return 0; } -- cgit v1.1 From 3e72b2fe5b31791f976350b023b7a37ef59c02c1 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:19:19 -0700 Subject: [NETFILTER]: x_tables: remove some unnecessary casts Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 2 +- net/netfilter/xt_connmark.c | 2 +- net/netfilter/xt_dccp.c | 3 +-- net/netfilter/xt_mark.c | 2 +- net/netfilter/xt_sctp.c | 4 +--- net/netfilter/xt_string.c | 2 +- 6 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 7c6836c..b88adc7 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -561,7 +561,7 @@ static void hashlimit_destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) { - struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + struct ipt_hashlimit_info *r = matchinfo; htable_put(r->hinfo); } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index dc26a27..56324c8a 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -58,7 +58,7 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { - struct xt_connmark_info *cm = (struct xt_connmark_info *)matchinfo; + struct xt_connmark_info *cm = matchinfo; if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { printk(KERN_WARNING "connmark: only support 32bit mark\n"); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index dfb10b6..2e2f825 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -101,8 +101,7 @@ match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { - const struct xt_dccp_info *info = - (const struct xt_dccp_info *)matchinfo; + const struct xt_dccp_info *info = matchinfo; struct dccp_hdr _dh, *dh; if (offset) diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 8b385a3..876bc57 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -42,7 +42,7 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { - struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo; + const struct xt_mark_info *minfo = matchinfo; if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { printk(KERN_WARNING "mark: only supports 32bit mark\n"); diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 34bd872..b5110e5 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -129,11 +129,9 @@ match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { - const struct xt_sctp_info *info; + const struct xt_sctp_info *info = matchinfo; sctp_sctphdr_t _sh, *sh; - info = (const struct xt_sctp_info *)matchinfo; - if (offset) { duprintf("Dropping non-first fragment.. FIXME\n"); return 0; diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 79d9ea6..0ebb6ac 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -30,8 +30,8 @@ static int match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { + const struct xt_string_info *conf = matchinfo; struct ts_state state; - struct xt_string_info *conf = (struct xt_string_info *) matchinfo; memset(&state, 0, sizeof(struct ts_state)); -- cgit v1.1 From 957dc80ac30f3c4d53259fa936df807663ba54fa Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:19:56 -0700 Subject: [NETFILTER]: x_tables: add SCTP/DCCP support where missing Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 20 ++++-------- net/ipv4/netfilter/ipt_hashlimit.c | 64 ++++++++++---------------------------- net/netfilter/xt_multiport.c | 7 +++-- 3 files changed, 26 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index aad9d28..dbc83c5 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) struct iphdr *iph = skb->nh.iph; unsigned long hashval; u_int16_t sport, dport; - struct tcphdr *th; - struct udphdr *uh; - struct icmphdr *ih; + u_int16_t *ports; switch (iph->protocol) { case IPPROTO_TCP: - th = (void *)iph+iph->ihl*4; - sport = ntohs(th->source); - dport = ntohs(th->dest); - break; case IPPROTO_UDP: - uh = (void *)iph+iph->ihl*4; - sport = ntohs(uh->source); - dport = ntohs(uh->dest); - break; + case IPPROTO_SCTP: + case IPPROTO_DCCP: case IPPROTO_ICMP: - ih = (void *)iph+iph->ihl*4; - sport = ntohs(ih->un.echo.id); - dport = (ih->type<<8)|ih->code; + ports = (void *)iph+iph->ihl*4; + sport = ports[0]; + dport = ports[1]; break; default: if (net_ratelimit()) { diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index b88adc7..85edfb7 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -28,9 +28,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -381,49 +378,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) dh->rateinfo.credit = dh->rateinfo.credit_cap; } -static inline int get_ports(const struct sk_buff *skb, int offset, - u16 ports[2]) -{ - union { - struct tcphdr th; - struct udphdr uh; - sctp_sctphdr_t sctph; - } hdr_u, *ptr_u; - - /* Must not be a fragment. */ - if (offset) - return 1; - - /* Must be big enough to read ports (both UDP and TCP have - them at the start). */ - ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); - if (!ptr_u) - return 1; - - switch (skb->nh.iph->protocol) { - case IPPROTO_TCP: - ports[0] = ptr_u->th.source; - ports[1] = ptr_u->th.dest; - break; - case IPPROTO_UDP: - ports[0] = ptr_u->uh.source; - ports[1] = ptr_u->uh.dest; - break; - case IPPROTO_SCTP: - ports[0] = ptr_u->sctph.source; - ports[1] = ptr_u->sctph.dest; - break; - default: - /* all other protocols don't supprot per-port hash - * buckets */ - ports[0] = ports[1] = 0; - break; - } - - return 0; -} - - static int hashlimit_match(const struct sk_buff *skb, const struct net_device *in, @@ -449,8 +403,22 @@ hashlimit_match(const struct sk_buff *skb, dst.src_ip = skb->nh.iph->saddr; if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { - u_int16_t ports[2]; - if (get_ports(skb, offset, ports)) { + u_int16_t _ports[2], *ports; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + ports = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), &_ports); + break; + default: + _ports[0] = _ports[1] = 0; + ports = _ports; + break; + } + if (!ports) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ *hotdrop = 1; diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index b56cd2b..1ff0a25 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -1,4 +1,4 @@ -/* Kernel module to match one of a list of TCP/UDP ports: ports are in +/* Kernel module to match one of a list of TCP/UDP/SCTP/DCCP ports: ports are in the same place so we can treat them as equal. */ /* (C) 1999-2001 Paul `Rusty' Russell @@ -160,8 +160,9 @@ check(u_int16_t proto, u_int8_t match_flags, u_int8_t count) { - /* Must specify proto == TCP/UDP, no unknown flags or bad count */ - return (proto == IPPROTO_TCP || proto == IPPROTO_UDP) + /* Must specify supported protocol, no unknown flags or bad count */ + return (proto == IPPROTO_TCP || proto == IPPROTO_UDP + || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) && !(ip_invflags & XT_INV_PROTO) && (match_flags == XT_MULTIPORT_SOURCE || match_flags == XT_MULTIPORT_DESTINATION -- cgit v1.1 From 62b7743483b402f8fb73545d5d487ca714e82766 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:20:32 -0700 Subject: [NETFILTER]: x_tables: add quota match Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 10 +++++ net/netfilter/Makefile | 1 + net/netfilter/xt_quota.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 net/netfilter/xt_quota.c (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e2893ef..5543c7b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -329,6 +329,16 @@ config NETFILTER_XT_MATCH_PKTTYPE To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_QUOTA + tristate '"quota" match support' + depends on NETFILTER_XTABLES + help + This option adds a `quota' match, which allows to match on a + byte counter. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 95b7e41..4b6a6ea 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c new file mode 100644 index 0000000..4cdba74 --- /dev/null +++ b/net/netfilter/xt_quota.c @@ -0,0 +1,96 @@ +/* + * netfilter module to enforce network quotas + * + * Sam Johnston + */ +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sam Johnston "); + +static DEFINE_SPINLOCK(quota_lock); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) +{ + struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master; + int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0; + + spin_lock_bh("a_lock); + if (q->quota >= skb->len) { + q->quota -= skb->len; + ret ^= 1; + } else { + /* we do not allow even small packets from now on */ + q->quota = 0; + } + spin_unlock_bh("a_lock); + + return ret; +} + +static int +checkentry(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + struct xt_quota_info *q = (struct xt_quota_info *)matchinfo; + + if (q->flags & ~XT_QUOTA_MASK) + return 0; + /* For SMP, we only want to use one set of counters. */ + q->master = q; + return 1; +} + +static struct xt_match quota_match = { + .name = "quota", + .family = AF_INET, + .match = match, + .matchsize = sizeof(struct xt_quota_info), + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static struct xt_match quota_match6 = { + .name = "quota", + .family = AF_INET6, + .match = match, + .matchsize = sizeof(struct xt_quota_info), + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init xt_quota_init(void) +{ + int ret; + + ret = xt_register_match("a_match); + if (ret) + goto err1; + ret = xt_register_match("a_match6); + if (ret) + goto err2; + return ret; + +err2: + xt_unregister_match("a_match); +err1: + return ret; +} + +static void __exit xt_quota_fini(void) +{ + xt_unregister_match("a_match6); + xt_unregister_match("a_match); +} + +module_init(xt_quota_init); +module_exit(xt_quota_fini); -- cgit v1.1 From f3389805e53a13bd969ee1c8fc5a4137b7c6c167 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:00 -0700 Subject: [NETFILTER]: x_tables: add statistic match Add statistic match which is a combination of the nth and random matches. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 6 +++ net/netfilter/Makefile | 1 + net/netfilter/xt_statistic.c | 112 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+) create mode 100644 net/netfilter/xt_statistic.c (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 5543c7b..85a7e17 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -375,6 +375,12 @@ config NETFILTER_XT_MATCH_STATE To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' + depends on NETFILTER_XTABLES + help + statistic module + config NETFILTER_XT_MATCH_STRING tristate '"string" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4b6a6ea..df1da25f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c new file mode 100644 index 0000000..de1037f --- /dev/null +++ b/net/netfilter/xt_statistic.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on ipt_random and ipt_nth by Fabrice MARIE . + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("xtables statistical match module"); +MODULE_ALIAS("ipt_statistic"); +MODULE_ALIAS("ip6t_statistic"); + +static DEFINE_SPINLOCK(nth_lock); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) +{ + struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0; + + switch (info->mode) { + case XT_STATISTIC_MODE_RANDOM: + if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) + ret ^= 1; + break; + case XT_STATISTIC_MODE_NTH: + info = info->master; + spin_lock_bh(&nth_lock); + if (info->u.nth.count++ == info->u.nth.every) { + info->u.nth.count = 0; + ret ^= 1; + } + spin_unlock_bh(&nth_lock); + break; + } + + return ret; +} + +static int +checkentry(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + + if (info->mode > XT_STATISTIC_MODE_MAX || + info->flags & ~XT_STATISTIC_MASK) + return 0; + info->master = info; + return 1; +} + +static struct xt_match statistic_match = { + .name = "statistic", + .match = match, + .matchsize = sizeof(struct xt_statistic_info), + .checkentry = checkentry, + .family = AF_INET, + .me = THIS_MODULE, +}; + +static struct xt_match statistic_match6 = { + .name = "statistic", + .match = match, + .matchsize = sizeof(struct xt_statistic_info), + .checkentry = checkentry, + .family = AF_INET6, + .me = THIS_MODULE, +}; + +static int __init xt_statistic_init(void) +{ + int ret; + + ret = xt_register_match(&statistic_match); + if (ret) + goto err1; + + ret = xt_register_match(&statistic_match6); + if (ret) + goto err2; + return ret; +err2: + xt_unregister_match(&statistic_match); +err1: + return ret; +} + +static void __exit xt_statistic_fini(void) +{ + xt_unregister_match(&statistic_match6); + xt_unregister_match(&statistic_match); +} + +module_init(xt_statistic_init); +module_exit(xt_statistic_fini); -- cgit v1.1 From 404bdbfd242cb99ca0e9d3eb5fbb5bcd54123081 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:34 -0700 Subject: [NETFILTER]: recent match: replace by rewritten version Replace the unmaintainable ipt_recent match by a rewritten version that should be fully compatible. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1268 ++++++++++++--------------------------- 1 file changed, 377 insertions(+), 891 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index b847ee4..9686c4d 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -1,1007 +1,493 @@ -/* Kernel module to check if the source address has been seen recently. */ -/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ -/* Author: Stephen Frost */ -/* Project Page: http://snowman.net/projects/ipt_recent/ */ -/* This software is distributed under the terms of the GPL, Version 2 */ -/* This copyright does not cover user programs that use kernel services - * by normal system calls. */ - -#include -#include +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#include +#include #include -#include -#include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include #include -#undef DEBUG -#define HASH_LOG 9 +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("IP tables recently seen matching module"); +MODULE_LICENSE("GPL"); -/* Defaults, these can be overridden on the module command-line. */ static unsigned int ip_list_tot = 100; static unsigned int ip_pkt_list_tot = 20; static unsigned int ip_list_hash_size = 0; static unsigned int ip_list_perms = 0644; -#ifdef DEBUG -static int debug = 1; -#endif - -static char version[] = -KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost . http://snowman.net/projects/ipt_recent/\n"; - -MODULE_AUTHOR("Stephen Frost "); -MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); -MODULE_LICENSE("GPL"); module_param(ip_list_tot, uint, 0400); module_param(ip_pkt_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); -#ifdef DEBUG -module_param(debug, bool, 0600); -MODULE_PARM_DESC(debug,"enable debugging output"); -#endif -MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); -MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); -MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); - -/* Structure of our list of recently seen addresses. */ -struct recent_ip_list { - u_int32_t addr; - u_int8_t ttl; - unsigned long last_seen; - unsigned long *last_pkts; - u_int32_t oldest_pkt; - u_int32_t hash_entry; - u_int32_t time_pos; -}; - -struct time_info_list { - u_int32_t position; - u_int32_t time; +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); + + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + u_int32_t addr; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; }; -/* Structure of our linked list of tables of recent lists. */ -struct recent_ip_tables { - char name[IPT_RECENT_NAME_LEN]; - int count; - int time_pos; - struct recent_ip_list *table; - struct recent_ip_tables *next; - spinlock_t list_lock; - int *hash_table; - struct time_info_list *time_info; +struct recent_table { + struct list_head list; + char name[IPT_RECENT_NAME_LEN]; #ifdef CONFIG_PROC_FS - struct proc_dir_entry *status_proc; -#endif /* CONFIG_PROC_FS */ + struct proc_dir_entry *proc; +#endif + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; }; -/* Our current list of addresses we have recently seen. - * Only added to on a --set, and only updated on --set || --update - */ -static struct recent_ip_tables *r_tables = NULL; - -/* We protect r_list with this spinlock so two processors are not modifying - * the list at the same time. - */ +static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); #ifdef CONFIG_PROC_FS -/* Our /proc/net/ipt_recent entry */ -static struct proc_dir_entry *proc_net_ipt_recent = NULL; -#endif - -/* Function declaration for later. */ -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop); - -/* Function to hash a given address into the hash table of table_size size */ -static int hash_func(unsigned int addr, int table_size) -{ - int result = 0; - unsigned int value = addr; - do { result ^= value; } while((value >>= HASH_LOG)); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", - result & (table_size - 1), - addr, - table_size); +static struct proc_dir_entry *proc_dir; +static struct file_operations recent_fops; #endif - return(result & (table_size - 1)); -} +static u_int32_t hash_rnd; +static int hash_rnd_initted; -#ifdef CONFIG_PROC_FS -/* This is the function which produces the output for our /proc output - * interface which lists each IP address, the last seen time and the - * other recent times the address was seen. - */ - -static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +static unsigned int recent_entry_hash(u_int32_t addr) { - int len = 0, count, last_len = 0, pkt_count; - off_t pos = 0; - off_t begin = 0; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - spin_lock_bh(&curr_table->list_lock); - for(count = 0; count < ip_list_tot; count++) { - if(!curr_table->table[count].addr) continue; - last_len = len; - len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); - len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); - len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); - len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); - len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); - for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(!curr_table->table[count].last_pkts[pkt_count]) break; - len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); - } - len += sprintf(buffer+len,"\n"); - pos = begin + len; - if(pos < offset) { len = 0; begin = pos; } - if(pos > offset + length) { len = last_len; break; } + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, 4); + hash_rnd_initted = 1; } - - *start = buffer + (offset - begin); - len -= (offset - begin); - if(len > length) len = length; - - spin_unlock_bh(&curr_table->list_lock); - return len; + return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1); } -/* ip_recent_ctrl provides an interface for users to modify the table - * directly. This allows adding entries, removing entries, and - * flushing the entire table. - * This is done by opening up the appropriate table for writing and - * sending one of: - * xx.xx.xx.xx -- Add entry to table with current time - * +xx.xx.xx.xx -- Add entry to table with current time - * -xx.xx.xx.xx -- Remove entry from table - * clear -- Flush table, remove all entries - */ - -static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl) { - static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; - u_int32_t val; - int base, used = 0; - char c, *cp; - union iaddr { - uint8_t bytes[4]; - uint32_t word; - } res; - uint8_t *pp = res.bytes; - int digit; - - char buffer[20]; - int len, check_set = 0, count; - u_int32_t addr = 0; - struct sk_buff *skb; - struct ipt_recent_info *info; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - if(size > 20) len = 20; else len = size; - - if(copy_from_user(buffer,input,len)) return -EFAULT; - - if(len < 20) buffer[len] = '\0'; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); -#endif + struct recent_entry *e; + unsigned int h; + + h = recent_entry_hash(addr); + list_for_each_entry(e, &table->iphash[h], list) + if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + return e; + return NULL; +} - cp = buffer; - while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} - /* Check if we are asked to flush the entire table */ - if(!memcmp(cp,"clear",5)) { - used += 5; - spin_lock_bh(&curr_table->list_lock); - curr_table->time_pos = 0; - for(count = 0; count < ip_list_hash_size; count++) { - curr_table->hash_table[count] = -1; - } - for(count = 0; count < ip_list_tot; count++) { - curr_table->table[count].last_seen = 0; - curr_table->table[count].addr = 0; - curr_table->table[count].ttl = 0; - memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - curr_table->table[count].oldest_pkt = 0; - curr_table->table[count].time_pos = 0; - curr_table->time_info[count].position = count; - curr_table->time_info[count].time = 0; - } - spin_unlock_bh(&curr_table->list_lock); - return used; - } +static struct recent_entry * +recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl) +{ + struct recent_entry *e; - check_set = IPT_RECENT_SET; - switch(*cp) { - case '+': check_set = IPT_RECENT_SET; cp++; used++; break; - case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; - default: if(!isdigit(*cp)) return (used+1); break; + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + e->addr = addr; + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); -#endif - /* Get addr (effectively inet_aton()) */ - /* Shamelessly stolen from libc, a function in the kernel for doing - * this would, of course, be greatly preferred, but our options appear - * to be rather limited, so we will just do it ourselves here. - */ - res.word = 0; - - c = *cp; - for(;;) { - if(!isdigit(c)) return used; - val = 0; base = 10; digit = 0; - if(c == '0') { - c = *++cp; - if(c == 'x' || c == 'X') base = 16, c = *++cp; - else { base = 8; digit = 1; } - } - for(;;) { - if(isascii(c) && isdigit(c)) { - if(base == 8 && (c == '8' || c == '0')) return used; - val = (val * base) + (c - '0'); - c = *++cp; - digit = 1; - } else if(base == 16 && isascii(c) && isxdigit(c)) { - val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); - c = *++cp; - digit = 1; - } else break; - } - if(c == '.') { - if(pp > res.bytes + 2 || val > 0xff) return used; - *pp++ = val; - c = *++cp; - } else break; - } - used = cp - buffer; - if(c != '\0' && (!isascii(c) || !isspace(c))) return used; - if(c == '\n') used++; - if(!digit) return used; +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + e->index %= ip_pkt_list_tot; + list_move_tail(&e->lru_list, &t->lru_list); +} - if(val > max[pp - res.bytes]) return used; - addr = res.word | htonl(val); +static struct recent_table *recent_table_lookup(const char *name) +{ + struct recent_table *t; - if(!addr && check_set == IPT_RECENT_SET) return used; + list_for_each_entry(t, &tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); -#endif +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; - /* Set up and just call match */ - info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); - if(!info) { return -ENOMEM; } - info->seconds = 0; - info->hit_count = 0; - info->check_set = check_set; - info->invert = 0; - info->side = IPT_RECENT_SOURCE; - strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); - info->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); - if (!skb) { - used = -ENOMEM; - goto out_free_info; - } - skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); - if (!skb->nh.iph) { - used = -ENOMEM; - goto out_free_skb; + for (i = 0; i < ip_list_hash_size; i++) { + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); } - - skb->nh.iph->saddr = addr; - skb->nh.iph->daddr = 0; - /* Clear ttl since we have no way of knowing it */ - skb->nh.iph->ttl = 0; - match(skb,NULL,NULL,NULL,info,0,0,NULL); - - kfree(skb->nh.iph); -out_free_skb: - kfree(skb); -out_free_info: - kfree(info); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); -#endif - return used; } -#endif /* CONFIG_PROC_FS */ - -/* 'match' is our primary function, called by the kernel whenever a rule is - * hit with our module as an option to it. - * What this function does depends on what was specifically asked of it by - * the user: - * --set -- Add or update last seen time of the source address of the packet - * -- matchinfo->check_set == IPT_RECENT_SET - * --rcheck -- Just check if the source address is in the list - * -- matchinfo->check_set == IPT_RECENT_CHECK - * --update -- If the source address is in the list, update last_seen - * -- matchinfo->check_set == IPT_RECENT_UPDATE - * --remove -- If the source address is in the list, remove it - * -- matchinfo->check_set == IPT_RECENT_REMOVE - * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds - * -- matchinfo->seconds - * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times - * -- matchinfo->hit_count - * --seconds and --hitcount can be combined - */ static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +ipt_recent_match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) { - int pkt_count, hits_found, ans; - unsigned long now; const struct ipt_recent_info *info = matchinfo; - u_int32_t addr = 0, time_temp; - u_int8_t ttl = skb->nh.iph->ttl; - int *hash_table; - int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; - struct time_info_list *time_info; - struct recent_ip_tables *curr_table; - struct recent_ip_tables *last_table; - struct recent_ip_list *r_list; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); -#endif - - /* Default is false ^ info->invert */ - ans = info->invert; + struct recent_table *t; + struct recent_entry *e; + u_int32_t addr; + u_int8_t ttl; + int ret = info->invert; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); -#endif + if (info->side == IPT_RECENT_DEST) + addr = skb->nh.iph->daddr; + else + addr = skb->nh.iph->saddr; - /* if out != NULL then routing has been done and TTL changed. - * We change it back here internally for match what came in before routing. */ - if(out) ttl++; + ttl = skb->nh.iph->ttl; + /* use TTL as seen before forwarding */ + if (out && !skb->sk) + ttl++; - /* Find the right table */ spin_lock_bh(&recent_lock); - curr_table = r_tables; - while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); -#endif - - spin_unlock_bh(&recent_lock); - - /* Table with this name not found, match impossible */ - if(!curr_table) { return ans; } - - /* Make sure no one is changing the list while we work with it */ - spin_lock_bh(&curr_table->list_lock); - - r_list = curr_table->table; - if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; - - if(!addr) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); -#endif - spin_unlock_bh(&curr_table->list_lock); - return ans; + t = recent_table_lookup(info->name); + e = recent_entry_lookup(t, addr, + info->check_set & IPT_RECENT_TTL ? ttl : 0); + if (e == NULL) { + if (!(info->check_set & IPT_RECENT_SET)) + goto out; + e = recent_entry_init(t, addr, ttl); + if (e == NULL) + *hotdrop = 1; + ret ^= 1; + goto out; } -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); -#endif - - /* Get jiffies now in case they changed while we were waiting for a lock */ - now = jiffies; - hash_table = curr_table->hash_table; - time_info = curr_table->time_info; - - orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); - /* Hash entry at this result used */ - /* Check for TTL match if requested. If TTL is zero then a match would never - * happen, so match regardless of existing TTL in that case. Zero means the - * entry was added via the /proc interface anyway, so we will just use the - * first TTL we get for that IP address. */ - if(info->check_set & IPT_RECENT_TTL) { - while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && - (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } else { - while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } - - if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { - /* IP not in list and not asked to SET */ - spin_unlock_bh(&curr_table->list_lock); - return ans; - } - - /* Check if we need to handle the collision, do not need to on REMOVE */ - if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", - orig_hash_result, - hash_result, - r_list[hash_table[orig_hash_result]].addr, - addr); -#endif - - /* We had a collision. - * orig_hash_result is where we started, hash_result is where we ended up. - * So, swap them because we are likely to see the same guy again sooner */ -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); - printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", - r_list[hash_table[orig_hash_result]].hash_entry); - } -#endif - - r_list[hash_table[orig_hash_result]].hash_entry = hash_result; - - - temp = hash_table[orig_hash_result]; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); -#endif - hash_table[orig_hash_result] = hash_table[hash_result]; - hash_table[hash_result] = temp; - temp = hash_result; - hash_result = orig_hash_result; - orig_hash_result = temp; - time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; - if(hash_table[hash_result] != -1) { - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); -#endif - } - - if(hash_table[hash_result] == -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", - hash_result, addr); -#endif - - /* New item found and IPT_RECENT_SET, so we need to add it */ - location = time_info[curr_table->time_pos].position; - hash_table[r_list[location].hash_entry] = -1; - hash_table[hash_result] = location; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].time_pos = curr_table->time_pos; - r_list[location].addr = addr; - r_list[location].ttl = ttl; - r_list[location].last_seen = now; - r_list[location].oldest_pkt = 1; - r_list[location].last_pkts[0] = now; - r_list[location].hash_entry = hash_result; - time_info[curr_table->time_pos].time = r_list[location].last_seen; - curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; - - ans = !info->invert; - } else { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", - hash_result, - addr); -#endif - - /* Existing item found */ - location = hash_table[hash_result]; - /* We have a match on address, now to make sure it meets all requirements for a - * full match. */ - if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { - if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; - if(info->seconds && !info->hit_count) { - if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; - } - if(info->seconds && info->hit_count) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - if(info->hit_count && !info->seconds) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - } -#ifdef DEBUG - if(debug) { - if(ans) - printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); - else - printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); - } -#endif - - /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the - * current timestamp to the last_seen. */ - if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); -#endif - /* Have to update our time info */ - time_loc = r_list[location].time_pos; - time_info[time_loc].time = now; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; - } - r_list[location].time_pos = time_loc; - r_list[location].ttl = ttl; - r_list[location].last_pkts[r_list[location].oldest_pkt] = now; - r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; - r_list[location].last_seen = now; - } - /* If we have been asked to remove the entry from the list, just set it to 0 */ - if(info->check_set & IPT_RECENT_REMOVE) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); -#endif - /* Check if this is part of a collision chain */ - while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { - orig_hash_result++; - if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { - /* Found collision chain, how deep does this rabbit hole go? */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); -#endif - end_collision_chain = orig_hash_result; - } - } - if(end_collision_chain != -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); -#endif - /* Part of a collision chain, swap it with the end of the chain - * before removing. */ - r_list[hash_table[end_collision_chain]].hash_entry = hash_result; - temp = hash_table[end_collision_chain]; - hash_table[end_collision_chain] = hash_table[hash_result]; - hash_table[hash_result] = temp; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - hash_result = end_collision_chain; - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - location = hash_table[hash_result]; - hash_table[r_list[location].hash_entry] = -1; - time_loc = r_list[location].time_pos; - time_info[time_loc].time = 0; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; + if (info->check_set & IPT_RECENT_SET) + ret ^= 1; + else if (info->check_set & IPT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret ^= 1; + } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { + unsigned long t = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(t, e->stamps[i])) + continue; + if (++hits >= info->hit_count) { + ret ^= 1; + break; } - r_list[location].time_pos = time_loc; - r_list[location].last_seen = 0; - r_list[location].addr = 0; - r_list[location].ttl = 0; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].oldest_pkt = 0; - ans = !info->invert; } - spin_unlock_bh(&curr_table->list_lock); - return ans; } - spin_unlock_bh(&curr_table->list_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); -#endif - return ans; + if (info->check_set & IPT_RECENT_SET || + (info->check_set & IPT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; } -/* This function is to verify that the rule given during the userspace iptables - * command is correct. - * If the command is valid then we check if the table name referred to by the - * rule exists, if not it is created. - */ static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) +ipt_recent_checkentry(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) { - int flag = 0, c; - unsigned long *hold; const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *find_table, *last_table; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); -#endif - - /* seconds and hit_count only valid for CHECK/UPDATE */ - if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_CHECK) flag++; - if(info->check_set & IPT_RECENT_UPDATE) flag++; - - /* One and only one of these should ever be set */ - if(flag != 1) return 0; + struct recent_table *t; + unsigned i; + int ret = 0; - /* Name must be set to something */ - if(!info->name || !info->name[0]) return 0; + if (hweight8(info->check_set & + (IPT_RECENT_SET | IPT_RECENT_REMOVE | + IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) + return 0; + if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && + (info->seconds || info->hit_count)) + return 0; + if (info->name[0] == '\0' || + strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) + return 0; - /* Things look good, create a list for this if it does not exist */ - /* Lock the linked list while we play with it */ spin_lock_bh(&recent_lock); - - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); -#endif - find_table->count++; - spin_unlock_bh(&recent_lock); - return 1; + t = recent_table_lookup(info->name); + if (t != NULL) { + t->refcnt++; + ret = 1; + goto out; } - spin_unlock_bh(&recent_lock); - - /* Table with this name not found */ - /* Allocate memory for new linked list item */ - -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); - printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_ATOMIC); + if (t == NULL) + goto out; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); + if (t->proc == NULL) { + kfree(t); + goto out; } + t->proc->proc_fops = &recent_fops; + t->proc->data = t; #endif + list_add_tail(&t->list, &tables); + ret = 1; +out: + spin_unlock_bh(&recent_lock); + return ret; +} - curr_table = vmalloc(sizeof(struct recent_ip_tables)); - if(curr_table == NULL) return 0; - - spin_lock_init(&curr_table->list_lock); - curr_table->next = NULL; - curr_table->count = 1; - curr_table->time_pos = 0; - strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); - curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - /* Allocate memory for this table and the list of packets in each entry. */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", - sizeof(struct recent_ip_list)*ip_list_tot, - info->name); -#endif - - curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); - if(curr_table->table == NULL) { vfree(curr_table); return 0; } - memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", - sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#endif - - hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); -#endif - if(hold == NULL) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; - } +static void +ipt_recent_destroy(const struct xt_match *match, void *matchinfo, + unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_table *t; - /* Allocate memory for the hash table */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", - sizeof(int)*ip_list_hash_size); + spin_lock_bh(&recent_lock); + t = recent_table_lookup(info->name); + if (--t->refcnt == 0) { + list_del(&t->list); + recent_table_flush(t); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, proc_dir); #endif - - curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); - if(!curr_table->hash_table) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - - for(c = 0; c < ip_list_hash_size; c++) { - curr_table->hash_table[c] = -1; + kfree(t); } + spin_unlock_bh(&recent_lock); +} - /* Allocate memory for the time info */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", - sizeof(struct time_info_list)*ip_list_tot); -#endif +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + struct recent_table *table; + unsigned int bucket; +}; - curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); - if(!curr_table->time_info) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->time_info[c].position = c; - curr_table->time_info[c].time = 0; - } +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; - /* Put the new table in place */ spin_lock_bh(&recent_lock); - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { - find_table->count++; - spin_unlock_bh(&recent_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); -#endif - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 1; - } - if(!last_table) r_tables = curr_table; else last_table->next = curr_table; - spin_unlock_bh(&recent_lock); - -#ifdef CONFIG_PROC_FS - /* Create our proc 'status' entry. */ - curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); - if (!curr_table->status_proc) { - vfree(hold); - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); - /* Destroy the created table */ - spin_lock_bh(&recent_lock); - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; - } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) { + list_for_each_entry(e, &t->iphash[st->bucket], list) { + if (p-- == 0) + return e; } - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(curr_table->table); - vfree(curr_table); - return 0; } - - curr_table->status_proc->owner = THIS_MODULE; - curr_table->status_proc->data = curr_table; - wmb(); - curr_table->status_proc->read_proc = ip_recent_get_info; - curr_table->status_proc->write_proc = ip_recent_ctrl; -#endif /* CONFIG_PROC_FS */ - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); -#endif + return NULL; +} - return 1; +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e = v; + struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); } -/* This function is called in the event that a rule matching this module is - * removed. - * When this happens we need to check if there are no other rules matching - * the table given. If that is the case then we remove the table and clean - * up its memory. - */ -static void -destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) +static void recent_seq_stop(struct seq_file *s, void *v) { - const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *last_table; + spin_unlock_bh(&recent_lock); +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); -#endif +static int recent_seq_show(struct seq_file *seq, void *v) +{ + struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", + NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} - if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; +static struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; - /* Lock the linked list while we play with it */ - spin_lock_bh(&recent_lock); +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct seq_file *seq; + struct recent_iter_state *st; + int ret; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &recent_seq_ops); + if (ret) + kfree(st); + st->table = pde->data; + seq = file->private_data; + seq->private = st; + return ret; +} - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); -#endif +static ssize_t recent_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+255.255.255.255")], *c = buf; + u_int32_t addr; + int add; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size)) + return -EFAULT; + while (isspace(*c)) + c++; + + if (size - (c - buf) < 5) + return c - buf; + if (!strncmp(c, "clear", 5)) { + c += 5; + spin_lock_bh(&recent_lock); + recent_table_flush(t); spin_unlock_bh(&recent_lock); - return; + return c - buf; } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - /* If a table does not exist then do nothing and return */ - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + switch (*c) { + case '-': + add = 0; + c++; + break; + case '+': + c++; + default: + add = 1; + break; } + addr = in_aton(c); - curr_table->count--; - - /* If count is still non-zero then there are still rules referenceing it so we do nothing */ - if(curr_table->count) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, addr, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, addr, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); -#endif - - /* Count must be zero so we remove this table from the list */ - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); + return size; +} - /* lock to make sure any late-runners still using this after we removed it from - * the list finish up then remove everything */ - spin_lock_bh(&curr_table->list_lock); - spin_unlock_bh(&curr_table->list_lock); - -#ifdef CONFIG_PROC_FS - if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +static struct file_operations recent_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; #endif /* CONFIG_PROC_FS */ - vfree(curr_table->table[0].last_pkts); - vfree(curr_table->table); - vfree(curr_table->hash_table); - vfree(curr_table->time_info); - vfree(curr_table); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); -#endif - return; -} - -/* This is the structure we pass to ipt_register to register our - * module with iptables. - */ static struct ipt_match recent_match = { .name = "recent", - .match = match, + .match = ipt_recent_match, .matchsize = sizeof(struct ipt_recent_info), - .checkentry = checkentry, - .destroy = destroy, - .me = THIS_MODULE + .checkentry = ipt_recent_checkentry, + .destroy = ipt_recent_destroy, + .me = THIS_MODULE, }; -/* Kernel module initialization. */ static int __init ipt_recent_init(void) { - int err, count; + int err; - printk(version); -#ifdef CONFIG_PROC_FS - proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); - if(!proc_net_ipt_recent) return -ENOMEM; -#endif - - if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { - printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); - ip_list_hash_size = 0; - } - - if(!ip_list_hash_size) { - ip_list_hash_size = ip_list_tot*3; - count = 2*2; - while(ip_list_hash_size > count) count = count*2; - ip_list_hash_size = count; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); -#endif + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); err = ipt_register_match(&recent_match); +#ifdef CONFIG_PROC_FS if (err) - remove_proc_entry("ipt_recent", proc_net); + return err; + proc_dir = proc_mkdir("ipt_recent", proc_net); + if (proc_dir == NULL) { + ipt_unregister_match(&recent_match); + err = -ENOMEM; + } +#endif return err; } -/* Kernel module destruction. */ -static void __exit ipt_recent_fini(void) +static void __exit ipt_recent_exit(void) { + BUG_ON(!list_empty(&tables)); ipt_unregister_match(&recent_match); - - remove_proc_entry("ipt_recent",proc_net); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_recent", proc_net); +#endif } -/* Register our module with the kernel. */ module_init(ipt_recent_init); -module_exit(ipt_recent_fini); +module_exit(ipt_recent_exit); -- cgit v1.1 From 6442f1cf897643d4ca597f2f7d3464b765bae960 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:53 -0700 Subject: [NETFILTER]: conntrack: don't call helpers for related ICMP messages None of the existing helpers expects to get called for related ICMP packets and some even drop them if they can't parse them. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_standalone.c | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 929d61f..f0cc7fe 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -417,7 +417,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && ct->helper) { + if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { unsigned int ret; ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 77d9744..8cc8e1b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) return NF_ACCEPT; help = nfct_help(ct); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 93bae36..2a71c3b 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -189,7 +189,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) goto out; help = nfct_help(ct); -- cgit v1.1 From 39a27a35c5c1b5be499a0576a35c45a011788bf8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:23:54 -0700 Subject: [NETFILTER]: conntrack: add sysctl to disable checksumming Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_udp.c | 2 +- net/ipv4/netfilter/ip_conntrack_standalone.c | 11 +++++++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 5 +++-- net/netfilter/nf_conntrack_proto_udp.c | 3 ++- net/netfilter/nf_conntrack_standalone.c | 11 +++++++++++ 9 files changed, 32 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index d8b14a9..23f1c50 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 062b252..c5c2ce5 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 7089986..9b2c16b 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index f0cc7fe..6cb9b98 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -564,6 +564,8 @@ extern unsigned int ip_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int ip_conntrack_checksum = 1; + static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { @@ -592,6 +594,14 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, + .procname = "ip_conntrack_checksum", + .data = &ip_conntrack_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "ip_conntrack_tcp_timeout_syn_sent", .data = &ip_ct_tcp_timeout_syn_sent, @@ -946,6 +956,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +EXPORT_SYMBOL_GPL(ip_conntrack_checksum); #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b0d361..663a73e 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 86c6703..ef18a7b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, return -NF_ACCEPT; } - if (hooknum == NF_IP6_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed\n"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 69899f2..12fb7c0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (nf_conntrack_checksum && + ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index d93edbf..ae07ebe 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + if (nf_conntrack_checksum && + ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDP)) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 408960c..e01d20d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -455,6 +455,8 @@ extern unsigned int nf_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int nf_conntrack_checksum = 1; + static struct ctl_table_header *nf_ct_sysctl_header; static ctl_table nf_ct_sysctl_table[] = { @@ -483,6 +485,14 @@ static ctl_table nf_ct_sysctl_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = NET_NF_CONNTRACK_CHECKSUM, + .procname = "nf_conntrack_checksum", + .data = &nf_conntrack_checksum, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "nf_conntrack_tcp_timeout_syn_sent", .data = &nf_ct_tcp_timeout_syn_sent, @@ -851,6 +861,7 @@ EXPORT_SYMBOL(nf_ct_proto_put); EXPORT_SYMBOL(nf_ct_l3proto_find_get); EXPORT_SYMBOL(nf_ct_l3proto_put); EXPORT_SYMBOL(nf_ct_l3protos); +EXPORT_SYMBOL_GPL(nf_conntrack_checksum); EXPORT_SYMBOL(nf_conntrack_expect_alloc); EXPORT_SYMBOL(nf_conntrack_expect_put); EXPORT_SYMBOL(nf_conntrack_expect_related); -- cgit v1.1 From 997ae831ade74bdaed4172b1c02060b9efd6e206 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Mon, 29 May 2006 18:24:20 -0700 Subject: [NETFILTER]: conntrack: add fixed timeout flag in connection tracking Add a flag in a connection status to have a non updated timeout. This permits to have connection that automatically die at a given time. Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 6 ++++++ net/netfilter/nf_conntrack_core.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a297da7..4fe9e69 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1130,6 +1130,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct, write_lock_bh(&ip_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&ip_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9b83f9..bc2bd4c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1396,6 +1396,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, write_lock_bh(&nf_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&nf_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!nf_ct_is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; -- cgit v1.1 From 3726add76643c715d437aceda320d319153b6113 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:24:39 -0700 Subject: [NETFILTER]: ctnetlink: fix NAT configuration The current configuration only allows to configure one manip and overloads conntrack status flags with netlink semantic. Signed-off-by: Patrick Mchardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 53 +++++++++++++------------------ net/netfilter/nf_conntrack_netlink.c | 53 +++++++++++++------------------ 2 files changed, 44 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 01bd7ca..af152e3 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -629,7 +629,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct ip_conntrack *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -639,7 +639,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -854,39 +854,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + } #endif } @@ -1106,7 +1097,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bd10eb9..8f27fe9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -641,7 +641,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct nf_conn *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -651,7 +651,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -866,39 +866,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + } #endif } @@ -1122,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } -- cgit v1.1 From 89f2e21883b59a6ff1e64d0b4924d06b1c6101ba Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:24:58 -0700 Subject: [NETFILTER]: ctnetlink: change table dumping not to require an unique ID Instead of using the ID to find out where to continue dumping, take a reference to the last entry dumped and try to continue there. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 32 +++++++++++++++++++++++-------- net/netfilter/nf_conntrack_netlink.c | 32 +++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index af152e3..33891bb 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -399,38 +399,54 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { DEBUGP("entered %s\n", __FUNCTION__); + if (cb->args[1]) + ip_conntrack_put((struct ip_conntrack *)cb->args[1]); return 0; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct ip_conntrack *ct = NULL; + struct ip_conntrack *ct, *last; struct ip_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, cb->args[0], *id); read_lock_bh(&ip_conntrack_lock); - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct ip_conntrack *)cb->args[1]; list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + ip_conntrack_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + ip_conntrack_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&ip_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8f27fe9..b8c7c56 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -407,6 +407,8 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); DEBUGP("entered %s\n", __FUNCTION__); return 0; } @@ -416,10 +418,9 @@ static int ctnetlink_done(struct netlink_callback *cb) static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conn *ct = NULL; + struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; @@ -427,7 +428,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0], *id); read_lock_bh(&nf_conntrack_lock); - for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct nf_conn *)cb->args[1]; list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { h = (struct nf_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -438,17 +441,30 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) * then dump everything. */ if (l3proto && L3PROTO(ct) != l3proto) continue; - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + nf_ct_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + nf_ct_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&nf_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); -- cgit v1.1 From 695ecea3299dba2239d1cb4fd4d4e4c95a5b9ce7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:14 -0700 Subject: [NETFILTER]: SNMP helper: fix debug module param type debug is the debug level, not a bool. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index c332442..d20d557 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void) module_init(ip_nat_snmp_basic_init); module_exit(ip_nat_snmp_basic_fini); -module_param(debug, bool, 0600); +module_param(debug, int, 0600); -- cgit v1.1 From 7d8c50181778b6ba10c2bba9a2f22db9493bb245 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:38 -0700 Subject: [NETFILTER]: FTP helper: search optimization Instead of skipping search entries for the wrong direction simply index them by direction. Based on patch by Pablo Neira Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_ftp.c | 77 +++++++++++++++++++---------------- net/netfilter/nf_conntrack_ftp.c | 77 +++++++++++++++++++---------------- 2 files changed, 86 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3e542bf..4dcf526 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); static const struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, u_int32_t[], char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb, array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, (*pskb)->len - dataoff, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, array, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (ip_nat_ftp_hook) - ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e38a4b5..11d3be2 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -67,37 +67,48 @@ static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, char); static struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -492,17 +503,15 @@ static int help(struct sk_buff **pskb, memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all)); - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, datalen, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, &cmd, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -512,7 +521,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -597,7 +606,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (nf_nat_ftp_hook) - ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = nf_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ -- cgit v1.1 From c95261693467f0aeac7fafa69860ddfb02bc12f8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:58 -0700 Subject: [NETFILTER]: amanda helper: convert to textsearch infrastructure When a port number within a packet is replaced by a differently sized number only the packet is resized, but not the copy of the data. Following port numbers are rewritten based on their offsets within the copy, leading to packet corruption. Convert the amanda helper to the textsearch infrastructure to avoid the copy entirely. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 2 + net/ipv4/netfilter/ip_conntrack_amanda.c | 143 ++++++++++++++++++++----------- 2 files changed, 96 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d407253..1540e22 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -142,6 +142,8 @@ config IP_NF_TFTP config IP_NF_AMANDA tristate "Amanda backup protocol support" depends on IP_NF_CONNTRACK + select TEXTSEARCH + select TEXTSEARCH_KMP help If you are running the Amanda backup package on this machine or machines that will be MASQUERADED through this diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a604b1c..0a7bd7f 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -17,33 +17,29 @@ * this value. * */ - -#include #include #include -#include -#include #include +#include +#include +#include +#include #include -#include -#include +#include #include #include static unsigned int master_timeout = 300; +static char *ts_algo = "kmp"; MODULE_AUTHOR("Brian J. Murrell "); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); - -static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; - -/* This is slow, but it's simple. --RR */ -static char *amanda_buffer; -static DEFINE_SPINLOCK(amanda_buffer_lock); +module_param(ts_algo, charp, 0400); +MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, struct ip_conntrack_expect *exp); EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); +enum amanda_strings { + SEARCH_CONNECT, + SEARCH_NEWLINE, + SEARCH_DATA, + SEARCH_MESG, + SEARCH_INDEX, +}; + +static struct { + char *string; + size_t len; + struct ts_config *ts; +} search[] = { + [SEARCH_CONNECT] = { + .string = "CONNECT ", + .len = 8, + }, + [SEARCH_NEWLINE] = { + .string = "\n", + .len = 1, + }, + [SEARCH_DATA] = { + .string = "DATA ", + .len = 5, + }, + [SEARCH_MESG] = { + .string = "MESG ", + .len = 5, + }, + [SEARCH_INDEX] = { + .string = "INDEX ", + .len = 6, + }, +}; + static int help(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { + struct ts_state ts; struct ip_conntrack_expect *exp; - char *data, *data_limit, *tmp; - unsigned int dataoff, i; + unsigned int dataoff, start, stop, off, i; + char pbuf[sizeof("65535")], *tmp; u_int16_t port, len; int ret = NF_ACCEPT; @@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb, return NF_ACCEPT; } - spin_lock_bh(&amanda_buffer_lock); - skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); - data = amanda_buffer; - data_limit = amanda_buffer + (*pskb)->len - dataoff; - *data_limit = '\0'; - - /* Search for the CONNECT string */ - data = strstr(data, "CONNECT "); - if (!data) + memset(&ts, 0, sizeof(ts)); + start = skb_find_text(*pskb, dataoff, (*pskb)->len, + search[SEARCH_CONNECT].ts, &ts); + if (start == UINT_MAX) goto out; - data += strlen("CONNECT "); + start += dataoff + search[SEARCH_CONNECT].len; - /* Only search first line. */ - if ((tmp = strchr(data, '\n'))) - *tmp = '\0'; + memset(&ts, 0, sizeof(ts)); + stop = skb_find_text(*pskb, start, (*pskb)->len, + search[SEARCH_NEWLINE].ts, &ts); + if (stop == UINT_MAX) + goto out; + stop += start; - for (i = 0; i < ARRAY_SIZE(conns); i++) { - char *match = strstr(data, conns[i]); - if (!match) + for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { + memset(&ts, 0, sizeof(ts)); + off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); + if (off == UINT_MAX) continue; - tmp = data = match + strlen(conns[i]); - port = simple_strtoul(data, &data, 10); - len = data - tmp; + off += start + search[i].len; + + len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); + if (skb_copy_bits(*pskb, off, pbuf, len)) + break; + pbuf[len] = '\0'; + + port = simple_strtoul(pbuf, &tmp, 10); + len = tmp - pbuf; if (port == 0 || len > 5) break; @@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb, exp->mask.dst.u.tcp.port = 0xFFFF; if (ip_nat_amanda_hook) - ret = ip_nat_amanda_hook(pskb, ctinfo, - tmp - amanda_buffer, + ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff, len, exp); else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; @@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb, } out: - spin_unlock_bh(&amanda_buffer_lock); return ret; } static struct ip_conntrack_helper amanda_helper = { - .max_expected = ARRAY_SIZE(conns), + .max_expected = 3, .timeout = 180, .me = THIS_MODULE, .help = help, @@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit ip_conntrack_amanda_fini(void) { + int i; + ip_conntrack_helper_unregister(&amanda_helper); - kfree(amanda_buffer); + for (i = 0; i < ARRAY_SIZE(search); i++) + textsearch_destroy(search[i].ts); } static int __init ip_conntrack_amanda_init(void) { - int ret; - - amanda_buffer = kmalloc(65536, GFP_KERNEL); - if (!amanda_buffer) - return -ENOMEM; - - ret = ip_conntrack_helper_register(&amanda_helper); - if (ret < 0) { - kfree(amanda_buffer); - return ret; + int ret, i; + + ret = -ENOMEM; + for (i = 0; i < ARRAY_SIZE(search); i++) { + search[i].ts = textsearch_prepare(ts_algo, search[i].string, + search[i].len, + GFP_KERNEL, TS_AUTOLOAD); + if (search[i].ts == NULL) + goto err; } + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) + goto err; return 0; - +err: + for (; i >= 0; i--) { + if (search[i].ts) + textsearch_destroy(search[i].ts); + } + return ret; } module_init(ip_conntrack_amanda_init); -- cgit v1.1 From c0d4cfd96dd0cc0dbf49435898808b5553af4822 Mon Sep 17 00:00:00 2001 From: Jing Min Zhao Date: Mon, 29 May 2006 18:26:27 -0700 Subject: [NETFILTER]: H.323 helper: Add support for Call Forwarding Signed-off-by: Jing Min Zhao Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 8 +- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 112 +++++++++++++++++++++ .../netfilter/ip_conntrack_helper_h323_types.c | 6 +- net/ipv4/netfilter/ip_nat_helper_h323.c | 77 ++++++++++++++ 4 files changed, 196 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1540e22..f86aeda 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -183,10 +183,10 @@ config IP_NF_H323 With this module you can support H.323 on a connection tracking/NAT firewall. - This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP - and T.120 based data and applications including audio, video, FAX, - chat, whiteboard, file transfer, etc. For more information, please - see http://nath323.sourceforge.net/. + This module supports RAS, Fast Start, H.245 Tunnelling, Call + Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, + whiteboard, file transfer, etc. For more information, please + visit http://nath323.sourceforge.net/. If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 518f581..3052468 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #if 0 #define DEBUGP printk @@ -38,6 +40,13 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); +static char *internal_net = NULL; +static u_int32_t internal_net_addr = 0; +static u_int32_t internal_net_mask = 0; +module_param(internal_net, charp, 0600); +MODULE_PARM_DESC(internal_net, "specify your internal network using format " + "address/mask. this is used by call forwarding support"); + /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, @@ -77,6 +86,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, TransportAddress * addr, u_int16_t port, struct ip_conntrack_expect * exp); +int (*nat_callforwarding_hook) (struct sk_buff ** pskb, + struct ip_conntrack * ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect * exp); int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, enum ip_conntrack_info ctinfo, @@ -683,6 +698,76 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, return ret; } +/* Forwarding declaration */ +void ip_conntrack_q931_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this); + +/****************************************************************************/ +static int expect_callforwarding(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + u_int32_t ip; + u_int16_t port; + struct ip_conntrack_expect *exp = NULL; + + /* Read alternativeAddress */ + if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) + return 0; + + /* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ + if (internal_net && + ((ip & internal_net_mask) == internal_net_addr) == + ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) == + internal_net_addr)) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } + + /* Create expect for the second call leg */ + if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) + return -1; + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ip; + exp->tuple.dst.u.tcp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + exp->flags = 0; + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) { + /* Need NAT */ + ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff, + addr, port, exp); + } else { /* Conntrack only */ + exp->expectfn = ip_conntrack_q931_expect; + + if (ip_conntrack_expect_related(exp) == 0) { + DEBUGP("ip_ct_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), + ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), + ntohs(exp->tuple.dst.u.tcp.port)); + } else + ret = -1; + } + + ip_conntrack_expect_put(exp); + + return ret; +} + /****************************************************************************/ static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, @@ -878,6 +963,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, DEBUGP("ip_ct_q931: Facility\n"); + if (facility->reason.choice == eFacilityReason_callForwarded) { + if (facility->options & eFacility_UUIE_alternativeAddress) + return expect_callforwarding(pskb, ct, ctinfo, data, + dataoff, + &facility-> + alternativeAddress); + return 0; + } + if (facility->options & eFacility_UUIE_h245Address) { ret = expect_h245(pskb, ct, ctinfo, data, dataoff, &facility->h245Address); @@ -1668,6 +1762,7 @@ static void fini(void) static int __init init(void) { int ret; + char *p; h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) @@ -1678,6 +1773,22 @@ static int __init init(void) return ret; } + if (internal_net) { + if ((p = strchr(internal_net, '/'))) + *p++ = 0; + if (isdigit(internal_net[0])) { + internal_net_addr = in_aton(internal_net); + if (p && isdigit(p[0])) + internal_net_mask = in_aton(p); + else + internal_net_mask = 0xffffffff; + internal_net_addr &= internal_net_mask; + } + DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n", + NIPQUAD(internal_net_addr), + NIPQUAD(internal_net_mask)); + } + DEBUGP("ip_ct_h323: init success\n"); return 0; } @@ -1696,6 +1807,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook); EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); EXPORT_SYMBOL_GPL(nat_t120_hook); EXPORT_SYMBOL_GPL(nat_h245_hook); +EXPORT_SYMBOL_GPL(nat_callforwarding_hook); EXPORT_SYMBOL_GPL(nat_q931_hook); MODULE_AUTHOR("Jing Min Zhao "); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c index 022c47b..4b35961 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c @@ -1,4 +1,4 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 * * Copyright (c) 2006 Jing Min Zhao * @@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ static field_t _Facility_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, - {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, - _TransportAddress}, + {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _Facility_UUIE_alternativeAliasAddress}, {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c index d45663d..419b878 100644 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ b/net/ipv4/netfilter/ip_nat_helper_h323.c @@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, } /****************************************************************************/ +static void ip_nat_callforwarding_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this) +{ + struct ip_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(new->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; + + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = this->saved_proto; + range.min_ip = range.max_ip = this->saved_ip; + + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + + ip_conntrack_q931_expect(new, this); +} + +/****************************************************************************/ +static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect *exp) +{ + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port; + + /* Set expectations for NAT */ + exp->saved_ip = exp->tuple.dst.ip; + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = ip_nat_callforwarding_expect; + exp->dir = !dir; + + /* Try to get same port: if not, try to change it. */ + for (nated_port = port; nated_port != 0; nated_port++) { + exp->tuple.dst.u.tcp.port = htons(nated_port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + printk("ip_nat_q931: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (!set_h225_addr(pskb, data, dataoff, addr, + ct->tuplehash[!dir].tuple.dst.ip, + nated_port) == 0) { + ip_conntrack_unexpect_related(exp); + return -1; + } + + /* Success */ + DEBUGP("ip_nat_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + +/****************************************************************************/ static int __init init(void) { BUG_ON(set_h245_addr_hook != NULL); @@ -496,6 +570,7 @@ static int __init init(void) BUG_ON(nat_rtp_rtcp_hook != NULL); BUG_ON(nat_t120_hook != NULL); BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); BUG_ON(nat_q931_hook != NULL); set_h245_addr_hook = set_h245_addr; @@ -505,6 +580,7 @@ static int __init init(void) nat_rtp_rtcp_hook = nat_rtp_rtcp; nat_t120_hook = nat_t120; nat_h245_hook = nat_h245; + nat_callforwarding_hook = nat_callforwarding; nat_q931_hook = nat_q931; DEBUGP("ip_nat_h323: init success\n"); @@ -521,6 +597,7 @@ static void __exit fini(void) nat_rtp_rtcp_hook = NULL; nat_t120_hook = NULL; nat_h245_hook = NULL; + nat_callforwarding_hook = NULL; nat_q931_hook = NULL; synchronize_net(); } -- cgit v1.1 From e44ab66a75e20c02193440a5e27c16c91630109b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:26:47 -0700 Subject: [NETFILTER]: H.323 helper: replace internal_net_addr parameter by routing-based heuristic Call Forwarding doesn't need to create an expectation if both peers can reach each other without our help. The internal_net_addr parameter lets the user explicitly specify a single network where this is true, but is not very flexible and even fails in the common case that calls will both be forwarded to outside parties and inside parties. Use an optional heuristic based on routing instead, the assumption is that if bpth the outgoing device and the gateway are equal, both peers can reach each other directly. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 57 +++++++++++++-------------- 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 3052468..0665674 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -40,12 +40,11 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); -static char *internal_net = NULL; -static u_int32_t internal_net_addr = 0; -static u_int32_t internal_net_mask = 0; -module_param(internal_net, charp, 0600); -MODULE_PARM_DESC(internal_net, "specify your internal network using format " - "address/mask. this is used by call forwarding support"); +static int callforward_filter = 1; +module_param(callforward_filter, bool, 0600); +MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " + "if both endpoints are on different sides " + "(determined by routing information)"); /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, @@ -721,12 +720,28 @@ static int expect_callforwarding(struct sk_buff **pskb, /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ - if (internal_net && - ((ip & internal_net_mask) == internal_net_addr) == - ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) == - internal_net_addr)) { - DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); - return 0; + if (callforward_filter) { + struct rtable *rt1, *rt2; + struct flowi fl1 = { + .fl4_dst = ip, + }; + struct flowi fl2 = { + .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, + }; + + if (ip_route_output_key(&rt1, &fl1) == 0) { + if (ip_route_output_key(&rt2, &fl2) == 0) { + if (rt1->rt_gateway == rt2->rt_gateway && + rt1->u.dst.dev == rt2->u.dst.dev) + ret = 1; + dst_release(&rt2->u.dst); + } + dst_release(&rt1->u.dst); + } + if (ret) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } } /* Create expect for the second call leg */ @@ -1762,7 +1777,6 @@ static void fini(void) static int __init init(void) { int ret; - char *p; h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) @@ -1772,23 +1786,6 @@ static int __init init(void) fini(); return ret; } - - if (internal_net) { - if ((p = strchr(internal_net, '/'))) - *p++ = 0; - if (isdigit(internal_net[0])) { - internal_net_addr = in_aton(internal_net); - if (p && isdigit(p[0])) - internal_net_mask = in_aton(p); - else - internal_net_mask = 0xffffffff; - internal_net_addr &= internal_net_mask; - } - DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n", - NIPQUAD(internal_net_addr), - NIPQUAD(internal_net_mask)); - } - DEBUGP("ip_ct_h323: init success\n"); return 0; } -- cgit v1.1 From ae5b7d8ba2c28d7d9835856fe0ca5f6ec95ea768 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:27:09 -0700 Subject: [NETFILTER]: Add SIP connection tracking helper Add SIP connection tracking helper. Originally written by Christian Hentschel , some cleanup, minor fixes and bidirectional SIP support added by myself. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 18 ++ net/ipv4/netfilter/Makefile | 2 + net/ipv4/netfilter/ip_conntrack_sip.c | 471 ++++++++++++++++++++++++++++++++++ net/ipv4/netfilter/ip_nat_sip.c | 249 ++++++++++++++++++ 4 files changed, 740 insertions(+) create mode 100644 net/ipv4/netfilter/ip_conntrack_sip.c create mode 100644 net/ipv4/netfilter/ip_nat_sip.c (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f86aeda..ff4b118 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -191,6 +191,18 @@ config IP_NF_H323 If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. +config IP_NF_SIP + tristate "SIP protocol support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + SIP is an application-layer control protocol that can establish, + modify, and terminate multimedia sessions (conferences) such as + Internet telephony calls. With the ip_conntrack_sip and + the ip_nat_sip modules you can support the protocol on a connection + tracking/NATing firewall. + + To compile it as a module, choose M here. If unsure, say Y. + config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help @@ -503,6 +515,12 @@ config IP_NF_NAT_H323 default IP_NF_NAT if IP_NF_H323=y default m if IP_NF_H323=m +config IP_NF_NAT_SIP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_SIP=y + default m if IP_NF_SIP=m + # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 461cb1e..3ded4a3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o # NAT helpers @@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o +obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c new file mode 100644 index 0000000..fc87ce0 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_sip.c @@ -0,0 +1,471 @@ +/* SIP extension for IP connection tracking. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_conntrack_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP connection tracking helper"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of sip servers"); + +static unsigned int sip_timeout = SIP_TIMEOUT; +module_param(sip_timeout, uint, 0600); +MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); + +unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr); +EXPORT_SYMBOL_GPL(ip_nat_sip_hook); + +unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr); +EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); + +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo); +EXPORT_SYMBOL_GPL(ct_sip_get_info); + + +static int digits_len(const char *dptr, const char *limit, int *shift); +static int epaddr_len(const char *dptr, const char *limit, int *shift); +static int skp_digits_len(const char *dptr, const char *limit, int *shift); +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); + +struct sip_header_nfo ct_sip_hdrs[] = { + { /* Via header */ + .lname = "Via:", + .lnlen = sizeof("Via:") - 1, + .sname = "\r\nv:", + .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ + .ln_str = "UDP ", + .ln_strlen = sizeof("UDP ") - 1, + .match_len = epaddr_len, + }, + { /* Contact header */ + .lname = "Contact:", + .lnlen = sizeof("Contact:") - 1, + .sname = "\r\nm:", + .snlen = sizeof("\r\nm:") - 1, + .ln_str = "sip:", + .ln_strlen = sizeof("sip:") - 1, + .match_len = skp_epaddr_len + }, + { /* Content length header */ + .lname = "Content-Length:", + .lnlen = sizeof("Content-Length:") - 1, + .sname = "\r\nl:", + .snlen = sizeof("\r\nl:") - 1, + .ln_str = ":", + .ln_strlen = sizeof(":") - 1, + .match_len = skp_digits_len + }, + { /* SDP media info */ + .lname = "\nm=", + .lnlen = sizeof("\nm=") - 1, + .sname = "\rm=", + .snlen = sizeof("\rm=") - 1, + .ln_str = "audio ", + .ln_strlen = sizeof("audio ") - 1, + .match_len = digits_len + }, + { /* SDP owner address*/ + .lname = "\no=", + .lnlen = sizeof("\no=") - 1, + .sname = "\ro=", + .snlen = sizeof("\ro=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* SDP connection info */ + .lname = "\nc=", + .lnlen = sizeof("\nc=") - 1, + .sname = "\rc=", + .snlen = sizeof("\rc=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* Requests headers */ + .lname = "sip:", + .lnlen = sizeof("sip:") - 1, + .sname = "sip:", + .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */ + .ln_str = "@", + .ln_strlen = sizeof("@") - 1, + .match_len = epaddr_len + }, + { /* SDP version header */ + .lname = "\nv=", + .lnlen = sizeof("\nv=") - 1, + .sname = "\rv=", + .snlen = sizeof("\rv=") - 1, + .ln_str = "=", + .ln_strlen = sizeof("=") - 1, + .match_len = digits_len + } +}; +EXPORT_SYMBOL_GPL(ct_sip_hdrs); + +/* get line lenght until first CR or LF seen. */ +int ct_sip_lnlen(const char *line, const char *limit) +{ + const char *k = line; + + while ((line <= limit) && (*line == '\r' || *line == '\n')) + line++; + + while (line <= limit) { + if (*line == '\r' || *line == '\n') + break; + line++; + } + return line - k; +} +EXPORT_SYMBOL_GPL(ct_sip_lnlen); + +/* Linear string search, case sensitive. */ +const char *ct_sip_search(const char *needle, const char *haystack, + size_t needle_len, size_t haystack_len) +{ + const char *limit = haystack + (haystack_len - needle_len); + + while (haystack <= limit) { + if (memcmp(haystack, needle, needle_len) == 0) + return haystack; + haystack++; + } + return NULL; +} +EXPORT_SYMBOL_GPL(ct_sip_search); + +static int digits_len(const char *dptr, const char *limit, int *shift) +{ + int len = 0; + while (dptr <= limit && isdigit(*dptr)) { + dptr++; + len++; + } + return len; +} + +/* get digits lenght, skiping blank spaces. */ +static int skp_digits_len(const char *dptr, const char *limit, int *shift) +{ + for (; dptr <= limit && *dptr == ' '; dptr++) + (*shift)++; + + return digits_len(dptr, limit, shift); +} + +/* Simple ipaddr parser.. */ +static int parse_ipaddr(const char *cp, const char **endp, + u_int32_t *ipaddr, const char *limit) +{ + unsigned long int val; + int i, digit = 0; + + for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { + digit = 0; + if (!isdigit(*cp)) + break; + + val = simple_strtoul(cp, (char **)&cp, 10); + if (val > 0xFF) + return -1; + + ((u_int8_t *)ipaddr)[i] = val; + digit = 1; + + if (*cp != '.') + break; + cp++; + } + if (!digit) + return -1; + + if (endp) + *endp = cp; + + return 0; +} + +/* skip ip address. returns it lenght. */ +static int epaddr_len(const char *dptr, const char *limit, int *shift) +{ + const char *aux = dptr; + u_int32_t ip; + + if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { + DEBUGP("ip: %s parse failed.!\n", dptr); + return 0; + } + + /* Port number */ + if (*dptr == ':') { + dptr++; + dptr += digits_len(dptr, limit, shift); + } + return dptr - aux; +} + +/* get address length, skiping user info. */ +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) +{ + int s = *shift; + + for (; dptr <= limit && *dptr != '@'; dptr++) + (*shift)++; + + if (*dptr == '@') { + dptr++; + (*shift)++; + } else + *shift = s; + + return epaddr_len(dptr, limit, shift); +} + +/* Returns 0 if not found, -1 error parsing. */ +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo) +{ + const char *limit, *aux, *k = dptr; + int shift = 0; + + limit = dptr + (dlen - hnfo->lnlen); + + while (dptr <= limit) { + if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && + (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { + dptr++; + continue; + } + aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, + ct_sip_lnlen(dptr, limit)); + if (!aux) { + DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, + hnfo->lname); + return -1; + } + aux += hnfo->ln_strlen; + + *matchlen = hnfo->match_len(aux, limit, &shift); + if (!*matchlen) + return -1; + + *matchoff = (aux - k) + shift; + + DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, + *matchlen); + return 1; + } + DEBUGP("%s header not found.\n", hnfo->lname); + return 0; +} + +static int set_expected_rtp(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u_int32_t ipaddr, u_int16_t port, + const char *dptr) +{ + struct ip_conntrack_expect *exp; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + int ret; + + exp = ip_conntrack_expect_alloc(ct); + if (exp == NULL) + return NF_DROP; + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.udp.port = 0; + exp->tuple.dst.ip = ipaddr; + exp->tuple.dst.u.udp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_UDP; + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.udp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.udp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + + exp->expectfn = NULL; + exp->flags = 0; + + if (ip_nat_sdp_hook) + ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr); + else { + if (ip_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + ip_conntrack_expect_put(exp); + + return ret; +} + +static int sip_help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + int ret = NF_ACCEPT; + int matchoff, matchlen; + u_int32_t ipaddr; + u_int16_t port; + + /* No Data ? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + DEBUGP("skb->len = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + ip_ct_refresh(ct, *pskb, sip_timeout * HZ); + + if (!skb_is_nonlinear(*pskb)) + dptr = (*pskb)->data + dataoff; + else { + DEBUGP("Copy of skbuff not supported yet.\n"); + goto out; + } + + if (ip_nat_sip_hook) { + if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) { + ret = NF_DROP; + goto out; + } + } + + /* After this point NAT, could have mangled skb, so + we need to recalculate payload lenght. */ + datalen = (*pskb)->len - dataoff; + + if (datalen < (sizeof("SIP/2.0 200") - 1)) + goto out; + + /* RTP info only in some SDP pkts */ + if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && + memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { + goto out; + } + /* Get ip and port address from SDP packet. */ + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_CONNECTION]) > 0) { + + /* We'll drop only if there are parse problems. */ + if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, + dptr + datalen) < 0) { + ret = NF_DROP; + goto out; + } + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_MEDIA]) > 0) { + + port = simple_strtoul(dptr + matchoff, NULL, 10); + if (port < 1024) { + ret = NF_DROP; + goto out; + } + ret = set_expected_rtp(pskb, ct, ctinfo, + ipaddr, port, dptr); + } + } +out: + return ret; +} + +static struct ip_conntrack_helper sip[MAX_PORTS]; +static char sip_names[MAX_PORTS][10]; + +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering helper for port %d\n", ports[i]); + ip_conntrack_helper_unregister(&sip[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = SIP_PORT; + + for (i = 0; i < ports_c; i++) { + /* Create helper structure */ + memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); + + sip[i].tuple.dst.protonum = IPPROTO_UDP; + sip[i].tuple.src.u.udp.port = htons(ports[i]); + sip[i].mask.src.u.udp.port = 0xFFFF; + sip[i].mask.dst.protonum = 0xFF; + sip[i].max_expected = 1; + sip[i].timeout = 3 * 60; /* 3 minutes */ + sip[i].me = THIS_MODULE; + sip[i].help = sip_help; + + tmpname = &sip_names[i][0]; + if (ports[i] == SIP_PORT) + sprintf(tmpname, "sip"); + else + sprintf(tmpname, "sip-%d", i); + sip[i].name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = ip_conntrack_helper_register(&sip[i]); + if (ret) { + printk("ERROR registering helper for port %d\n", + ports[i]); + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c new file mode 100644 index 0000000..6ffba63 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_sip.c @@ -0,0 +1,249 @@ +/* SIP extension for UDP NAT alteration. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_nat_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP NAT helper"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +extern struct sip_header_nfo ct_sip_hdrs[]; + +static unsigned int mangle_sip_packet(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr, size_t dlen, + char *buffer, int bufflen, + struct sip_header_nfo *hnfo) +{ + unsigned int matchlen, matchoff; + + if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0) + return 0; + + if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, buffer, bufflen)) + return 0; + + /* We need to reload this. Thanks Patrick. */ + *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + return 1; +} + +static unsigned int ip_nat_sip(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int bufflen, dataoff; + u_int32_t ip; + u_int16_t port; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + ip = ct->tuplehash[!dir].tuple.dst.ip; + port = ct->tuplehash[!dir].tuple.dst.u.udp.port; + bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port)); + + /* short packet ? */ + if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1)) + return 0; + + /* Basic rules: requests and responses. */ + if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) { + const char *aux; + + if ((ctinfo) < IP_CT_IS_REPLY) { + mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* This search should ignore case, but later.. */ + aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1, + (*pskb)->len - dataoff); + if (!aux) + return 0; + + if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"), + ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff))) + return 1; + + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + } + if ((ctinfo) < IP_CT_IS_REPLY) { + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* Mangle Contact if exists only. - watch udp_nat_mangle()! */ + mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + /* This mangle requests headers. */ + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + ct_sip_lnlen(*dptr, + *dptr + (*pskb)->len - dataoff), + buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]); +} + +static int mangle_content_len(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char *dptr) +{ + unsigned int dataoff, matchoff, matchlen; + char buffer[sizeof("65536")]; + int bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Get actual SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) { + + /* since ct_sip_get_info() give us a pointer passing 'v=' + we need to add 2 bytes in this count. */ + int c_len = (*pskb)->len - dataoff - matchoff + 2; + + /* Now, update SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) { + + bufflen = sprintf(buffer, "%u", c_len); + + return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, + buffer, bufflen); + } + } + return 0; +} + +static unsigned int mangle_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + u_int32_t newip, u_int16_t port, + const char *dptr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int dataoff, bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Mangle owner and contact info. */ + bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_OWNER])) + return 0; + + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION])) + return 0; + + /* Mangle media port. */ + bufflen = sprintf(buffer, "%u", port); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_MEDIA])) + return 0; + + return mangle_content_len(pskb, ctinfo, ct, dptr); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr) +{ + struct ip_conntrack *ct = exp->master; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int32_t newip; + u_int16_t port; + + DEBUGP("ip_nat_sdp():\n"); + + /* Connection will come from reply */ + newip = ct->tuplehash[!dir].tuple.dst.ip; + + exp->tuple.dst.ip = newip; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { + exp->tuple.dst.u.udp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) + return NF_DROP; + + if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { + ip_conntrack_unexpect_related(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_sip_hook = NULL; + ip_nat_sdp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_sip_hook); + BUG_ON(ip_nat_sdp_hook); + ip_nat_sip_hook = ip_nat_sip; + ip_nat_sdp_hook = ip_nat_sdp; + return 0; +} + +module_init(init); +module_exit(fini); -- cgit v1.1 From c45fb1089e714146206d7e295ff337893424c874 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 29 May 2006 18:27:32 -0700 Subject: [NETFILTER]: PPTP helper: fixup gre_keymap_lookup() return type GRE keys are 16-bit wide. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_gre.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 5679479..21ee124 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, } /* look up the source key for a given tuple */ -static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) +static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) { struct ip_ct_gre_keymap *km; - u_int32_t key = 0; + __be16 key = 0; read_lock_bh(&ip_ct_gre_lock); km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, @@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, struct ip_conntrack_tuple *tuple) { struct gre_hdr_pptp _pgrehdr, *pgrehdr; - u_int32_t srckey; + __be16 srckey; struct gre_hdr _grehdr, *grehdr; /* first only delinearize old RFC1701 GRE header */ -- cgit v1.1 From 2f45c340e09242641d4f11498c3be48b35abb926 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 2 Jun 2006 16:29:20 -0700 Subject: [LLC]: Fix double receive of SKB. Oops fix from Stephen: remove duplicate rcv() calls. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/llc/llc_input.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 32cf8ac..94d2368 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -176,7 +176,6 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); - rcv(skb, dev, pt, orig_dev); } dest = llc_pdu_type(skb); if (unlikely(!dest || !llc_type_handlers[dest - 1])) -- cgit v1.1 From 7c106d7e782bd4805f39da30e81018f861b4b8c5 Mon Sep 17 00:00:00 2001 From: Wong Hoi Sing Edison Date: Mon, 5 Jun 2006 17:27:58 -0700 Subject: [TCP]: TCP Low Priority congestion control TCP Low Priority is a distributed algorithm whose goal is to utilize only the excess network bandwidth as compared to the ``fair share`` of bandwidth as targeted by TCP. Available from: http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf Original Author: Aleksandar Kuzmanovic See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. As of 2.6.13, Linux supports pluggable congestion control algorithms. Due to the limitation of the API, we take the following changes from the original TCP-LP implementation: o We use newReno in most core CA handling. Only add some checking within cong_avoid. o Error correcting in remote HZ, therefore remote HZ will be keeped on checking and updating. o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne OWD have a similar meaning as RTT. Also correct the buggy formular. o Handle reaction for Early Congestion Indication (ECI) within pkts_acked, as mentioned within pseudo code. o OWD is handled in relative format, where local time stamp will in tcp_time_stamp format. Port from 2.4.19 to 2.6.16 as module by: Wong Hoi Sing Edison Hung Hing Lun Signed-off-by: Wong Hoi Sing Edison Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_lp.c | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 349 insertions(+) create mode 100644 net/ipv4/tcp_lp.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 35eb70b..ae85149 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -550,6 +550,16 @@ config TCP_CONG_SCALABLE properties, though is known to have fairness issues. See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ +config TCP_CONG_LP + tristate "TCP Low Priority" + depends on EXPERIMENTAL + default n + ---help--- + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utiliza only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4cc9448..58a7a46 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o +obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 0000000..1f977b6 --- /dev/null +++ b/net/ipv4/tcp_lp.c @@ -0,0 +1,338 @@ +/* + * TCP Low Priority (TCP-LP) + * + * TCP Low Priority is a distributed algorithm whose goal is to utilize only + * the excess network bandwidth as compared to the ``fair share`` of + * bandwidth as targeted by TCP. Available from: + * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * + * Original Author: + * Aleksandar Kuzmanovic + * + * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. + * As of 2.6.13, Linux supports pluggable congestion control algorithms. + * Due to the limitation of the API, we take the following changes from + * the original TCP-LP implementation: + * o We use newReno in most core CA handling. Only add some checking + * within cong_avoid. + * o Error correcting in remote HZ, therefore remote HZ will be keeped + * on checking and updating. + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + * OWD have a similar meaning as RTT. Also correct the buggy formular. + * o Handle reaction for Early Congestion Indication (ECI) within + * pkts_acked, as mentioned within pseudo code. + * o OWD is handled in relative format, where local time stamp will in + * tcp_time_stamp format. + * + * Port from 2.4.19 to 2.6.16 as module by: + * Wong Hoi Sing Edison + * Hung Hing Lun + * + * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ + */ + +#include +#include +#include + +/* resolution of owd */ +#define LP_RESOL 1000 + +/** + * enum tcp_lp_state + * @LP_VALID_RHZ: is remote HZ valid? + * @LP_VALID_OWD: is OWD valid? + * @LP_WITHIN_THR: are we within threshold? + * @LP_WITHIN_INF: are we within inference? + * + * TCP-LP's state flags. + * We create this set of state flag mainly for debugging. + */ +enum tcp_lp_state { + LP_VALID_RHZ = (1 << 0), + LP_VALID_OWD = (1 << 1), + LP_WITHIN_THR = (1 << 3), + LP_WITHIN_INF = (1 << 4), +}; + +/** + * struct lp + * @flag: TCP-LP state flag + * @sowd: smoothed OWD << 3 + * @owd_min: min OWD + * @owd_max: max OWD + * @owd_max_rsv: resrved max owd + * @remote_hz: estimated remote HZ + * @remote_ref_time: remote reference time + * @local_ref_time: local reference time + * @last_drop: time for last active drop + * @inference: current inference + * + * TCP-LP's private struct. + * We get the idea from original TCP-LP implementation where only left those we + * found are really useful. + */ +struct lp { + u32 flag; + u32 sowd; + u32 owd_min; + u32 owd_max; + u32 owd_max_rsv; + u32 remote_hz; + u32 remote_ref_time; + u32 local_ref_time; + u32 last_drop; + u32 inference; +}; + +/** + * tcp_lp_init + * + * Init all required variables. + * Clone the handling from Vegas module implementation. + */ +static void tcp_lp_init(struct sock *sk) +{ + struct lp *lp = inet_csk_ca(sk); + + lp->flag = 0; + lp->sowd = 0; + lp->owd_min = 0xffffffff; + lp->owd_max = 0; + lp->owd_max_rsv = 0; + lp->remote_hz = 0; + lp->remote_ref_time = 0; + lp->local_ref_time = 0; + lp->last_drop = 0; + lp->inference = 0; +} + +/** + * tcp_lp_cong_avoid + * + * Implementation of cong_avoid. + * Will only call newReno CA when away from inference. + * From TCP-LP's paper, this will be handled in additive increasement. + */ +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + struct lp *lp = inet_csk_ca(sk); + + if (!(lp->flag & LP_WITHIN_INF)) + tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); +} + +/** + * tcp_lp_remote_hz_estimator + * + * Estimate remote HZ. + * We keep on updating the estimated value, where original TCP-LP + * implementation only guest it for once and use forever. + */ +static u32 tcp_lp_remote_hz_estimator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ + s64 m = 0; + + /* not yet record reference time + * go away!! record it before come back!! */ + if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) + goto out; + + /* we can't calc remote HZ with no different!! */ + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time + || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + + m = HZ * (tp->rx_opt.rcv_tsval - + lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - + lp->local_ref_time); + if (m < 0) + m = -m; + + if (rhz != 0) { + m -= rhz >> 6; /* m is now error in remote HZ est */ + rhz += m; /* 63/64 old + 1/64 new */ + } else + rhz = m << 6; + + /* record time for successful remote HZ calc */ + lp->flag |= LP_VALID_RHZ; + + out: + /* record reference time stamp */ + lp->remote_ref_time = tp->rx_opt.rcv_tsval; + lp->local_ref_time = tp->rx_opt.rcv_tsecr; + + return rhz >> 6; +} + +/** + * tcp_lp_owd_calculator + * + * Calculate one way delay (in relative format). + * Original implement OWD as minus of remote time difference to local time + * difference directly. As this time difference just simply equal to RTT, when + * the network status is stable, remote RTT will equal to local RTT, and result + * OWD into zero. + * It seems to be a bug and so we fixed it. + */ +static u32 tcp_lp_owd_calculator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 owd = 0; + + lp->remote_hz = tcp_lp_remote_hz_estimator(sk); + + if (lp->flag & LP_VALID_RHZ) { + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - + tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + if (owd < 0) + owd = -owd; + } + + if (owd > 0) + lp->flag |= LP_VALID_OWD; + else + lp->flag &= ~LP_VALID_OWD; + + return owd; +} + +/** + * tcp_lp_rtt_sample + * + * Implementation or rtt_sample. + * Will take the following action, + * 1. calc OWD, + * 2. record the min/max OWD, + * 3. calc smoothed OWD (SOWD). + * Most ideas come from the original TCP-LP implementation. + */ +static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +{ + struct lp *lp = inet_csk_ca(sk); + s64 mowd = tcp_lp_owd_calculator(sk); + + /* sorry that we don't have valid data */ + if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) + return; + + /* record the next min owd */ + if (mowd < lp->owd_min) + lp->owd_min = mowd; + + /* always forget the max of the max + * we just set owd_max as one below it */ + if (mowd > lp->owd_max) { + if (mowd > lp->owd_max_rsv) { + if (lp->owd_max_rsv == 0) + lp->owd_max = mowd; + else + lp->owd_max = lp->owd_max_rsv; + lp->owd_max_rsv = mowd; + } else + lp->owd_max = mowd; + } + + /* calc for smoothed owd */ + if (lp->sowd != 0) { + mowd -= lp->sowd >> 3; /* m is now error in owd est */ + lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ + } else + lp->sowd = mowd << 3; /* take the measured time be owd */ +} + +/** + * tcp_lp_pkts_acked + * + * Implementation of pkts_acked. + * Deal with active drop under Early Congestion Indication. + * Only drop to half and 1 will be handle, because we hope to use back + * newReno in increase case. + * We work it out by following the idea from TCP-LP's paper directly + */ +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + + /* calc inference */ + if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) + lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + + /* test if within inference */ + if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; + + /* test if within threshold */ + if (lp->sowd >> 3 < + lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) + lp->flag |= LP_WITHIN_THR; + else + lp->flag &= ~LP_WITHIN_THR; + + pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, + tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + lp->sowd >> 3); + + if (lp->flag & LP_WITHIN_THR) + return; + + /* FIXME: try to reset owd_min and owd_max here + * so decrease the chance the min/max is no longer suitable + * and will usually within threshold when whithin inference */ + lp->owd_min = lp->sowd >> 3; + lp->owd_max = lp->sowd >> 2; + lp->owd_max_rsv = lp->sowd >> 2; + + /* happened within inference + * drop snd_cwnd into 1 */ + if (lp->flag & LP_WITHIN_INF) + tp->snd_cwnd = 1U; + + /* happened after inference + * cut snd_cwnd into half */ + else + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ + lp->last_drop = tcp_time_stamp; +} + +static struct tcp_congestion_ops tcp_lp = { + .init = tcp_lp_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_lp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_lp_rtt_sample, + .pkts_acked = tcp_lp_pkts_acked, + + .owner = THIS_MODULE, + .name = "lp" +}; + +static int __init tcp_lp_register(void) +{ + BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_lp); +} + +static void __exit tcp_lp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_lp); +} + +module_init(tcp_lp_register); +module_exit(tcp_lp_unregister); + +MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Low Priority"); -- cgit v1.1 From 76f1017757aa0c308a0b83ca611c9a89ee9a79a4 Mon Sep 17 00:00:00 2001 From: Bin Zhou Date: Mon, 5 Jun 2006 17:28:30 -0700 Subject: [TCP]: TCP Veno congestion control TCP Veno module is a new congestion control module to improve TCP performance over wireless networks. The key innovation in TCP Veno is the enhancement of TCP Reno/Sack congestion control algorithm by using the estimated state of a connection based on TCP Vegas. This scheme significantly reduces "blind" reduction of TCP window regardless of the cause of packet loss. This work is based on the research paper "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." C. P. Fu, S. C. Liew, IEEE Journal on Selected Areas in Communication, Feb. 2003. Original paper and many latest research works on veno: http://www.ntu.edu.sg/home/ascpfu/veno/veno.html Signed-off-by: Bin Zhou Cheng Peng Fu Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 12 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_veno.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 net/ipv4/tcp_veno.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index ae85149..8514106 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -560,6 +560,18 @@ config TCP_CONG_LP ``fair share`` of bandwidth as targeted by TCP. See http://www-ece.rice.edu/networks/TCP-LP/ +config TCP_CONG_VENO + tristate "TCP Veno" + depends on EXPERIMENTAL + default n + ---help--- + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 58a7a46..00fcd2c1 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 0000000..1091671 --- /dev/null +++ b/net/ipv4/tcp_veno.c @@ -0,0 +1,238 @@ +/* + * TCP Veno congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * C. P. Fu, S. C. Liew. + * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." + * IEEE Journal on Selected Areas in Communication, + * Feb. 2003. + * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Veno variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static const int beta = 3 << V_PARAM_SHIFT; + +/* Veno variables */ +struct veno { + u8 doing_veno_now; /* if true, do veno for this rtt */ + u16 cntrtt; /* # of rtts measured within last rtt */ + u32 minrtt; /* min of rtts measured within last rtt (in usec) */ + u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ + u32 inc; /* decide whether to increase cwnd */ + u32 diff; /* calculate the diff rate */ +}; + +/* There are several situations when we must "re-start" Veno: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + */ +static inline void veno_enable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn on Veno */ + veno->doing_veno_now = 1; + + veno->minrtt = 0x7fffffff; +} + +static inline void veno_disable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn off Veno */ + veno->doing_veno_now = 0; +} + +static void tcp_veno_init(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + veno->basertt = 0x7fffffff; + veno->inc = 1; + veno_enable(sk); +} + +/* Do rtt sampling needed for Veno. */ +static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct veno *veno = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + + /* Filter to find propagation delay: */ + if (vrtt < veno->basertt) + veno->basertt = vrtt; + + /* Find the min rtt during the last rtt to find + * the current prop. delay + queuing delay: + */ + veno->minrtt = min(veno->minrtt, vrtt); + veno->cntrtt++; +} + +static void tcp_veno_state(struct sock *sk, u8 ca_state) +{ + if (ca_state == TCP_CA_Open) + veno_enable(sk); + else + veno_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Veno calculations + * until we get fresh rtt samples. So when we + * restart, we reset our Veno state to a clean + * state. After we get acks for this flight of + * packets, _then_ we can make Veno calculations + * again. + */ +static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_veno_init(sk); +} + +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (!veno->doing_veno_now) + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + + /* limited by applications */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* We do the Veno calculations only if we got enough rtt samples */ + if (veno->cntrtt <= 2) { + /* We don't have enough rtt samples to do the Veno + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + } else { + u32 rtt, target_cwnd; + + /* We have enough rtt samples, so, using the Veno + * algorithm, we determine the state of the network. + */ + + rtt = veno->minrtt; + + target_cwnd = ((tp->snd_cwnd * veno->basertt) + << V_PARAM_SHIFT) / rtt; + + veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } else { + /* In the "congestive state", increase cwnd + * every other rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc + && tp->snd_cwnd < + tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; + } else + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + + } + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + /* Wipe the slate clean for the next rtt. */ + /* veno->cntrtt = 0; */ + veno->minrtt = 0x7fffffff; +} + +/* Veno MD phase */ +static u32 tcp_veno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (veno->diff < beta) + /* in "non-congestive state", cut cwnd by 1/5 */ + return max(tp->snd_cwnd * 4 / 5, 2U); + else + /* in "congestive state", cut cwnd by 1/2 */ + return max(tp->snd_cwnd >> 1U, 2U); +} + +static u32 tcp_veno_min_cwnd(struct sock * sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh; +} + +static struct tcp_congestion_ops tcp_veno = { + .init = tcp_veno_init, + .ssthresh = tcp_veno_ssthresh, + .cong_avoid = tcp_veno_cong_avoid, + .min_cwnd = tcp_veno_min_cwnd, + .rtt_sample = tcp_veno_rtt_calc, + .set_state = tcp_veno_state, + .cwnd_event = tcp_veno_cwnd_event, + + .owner = THIS_MODULE, + .name = "veno", +}; + +static int __init tcp_veno_register(void) +{ + BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_veno); + return 0; +} + +static void __exit tcp_veno_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_veno); +} + +module_init(tcp_veno_register); +module_exit(tcp_veno_unregister); + +MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Veno"); -- cgit v1.1 From f890f921040fef6a35e39d15b729af1fd1a35f29 Mon Sep 17 00:00:00 2001 From: "Angelo P. Castellani" Date: Mon, 5 Jun 2006 17:29:09 -0700 Subject: [TCP]: TCP Compound congestion control TCP Compound is a sender-side only change to TCP that uses a mixed Reno/Vegas approach to calculate the cwnd. For further details look here: ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf Signed-off-by: Angelo P. Castellani Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_compound.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100644 net/ipv4/tcp_compound.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8514106..da33393 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -572,6 +572,16 @@ config TCP_CONG_VENO loss packets. See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf +config TCP_CONG_COMPOUND + tristate "TCP Compound" + depends on EXPERIMENTAL + default n + ---help--- + TCP Compound is a sender-side only change to TCP that uses + a mixed Reno/Vegas approach to calculate the cwnd. + For further details look here: + ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 00fcd2c1..f30faef 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o +obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c new file mode 100644 index 0000000..8c1ebfb --- /dev/null +++ b/net/ipv4/tcp_compound.c @@ -0,0 +1,407 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + * + * + * TCP Compound based on TCP Vegas + * + * further details can be found here: + * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 + +#define TCP_COMPOUND_ALPHA 3U +#define TCP_COMPOUND_BETA 1U +#define TCP_COMPOUND_KAPPA_POW 3 +#define TCP_COMPOUND_KAPPA_NSQRT 2 +#define TCP_COMPOUND_GAMMA 30 +#define TCP_COMPOUND_ZETA 1 + +/* TCP compound variables */ +struct compound { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now; /* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + u32 cwnd; + u32 dwnd; +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +static void tcp_compound_init(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(sk); + + vegas->dwnd = 0; + vegas->cwnd = tp->snd_cwnd; +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct compound *vegas = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_compound_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(sk); + else + vegas_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_compound_init(sk); +} + +static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + u8 inc = 0; + + if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { + if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) { + vegas->cwnd = tp->snd_cwnd; + vegas->dwnd = 0; + } else + vegas->cwnd = tp->snd_cwnd - vegas->dwnd; + + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (vegas->cwnd <= tp->snd_ssthresh) + inc = 1; + else if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + inc = 1; + tp->snd_cwnd_cnt = 0; + } + + if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) + vegas->cwnd++; + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + if (!tp->mss_cache) + return; + + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT > 2) { + u32 rtt, target_cwnd, diff; + u32 brtt, dwnd; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + if (!rtt) + return; + + brtt = vegas->baseRTT; + target_cwnd = ((old_wnd * brtt) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + dwnd = vegas->dwnd; + + if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { + u32 i, j, x, x2; + u64 v; + + v = 1; + + for (i = 0; i < TCP_COMPOUND_KAPPA_POW; i++) + v *= old_wnd; + + for (i = 0; i < TCP_COMPOUND_KAPPA_NSQRT; i++) { + x = 1; + for (j = 0; j < 200; j++) { + x2 = (x + v / x) / 2; + + if (x2 == x || !x2) + break; + + x = x2; + } + v = x; + } + + x = (u32) v >> TCP_COMPOUND_ALPHA; + + if (x > 1) + dwnd = x - 1; + else + dwnd = 0; + + dwnd += vegas->dwnd; + + } else if ((dwnd << V_PARAM_SHIFT) < + (diff * TCP_COMPOUND_BETA)) + dwnd = 0; + else + dwnd = + ((dwnd << V_PARAM_SHIFT) - + (diff * + TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; + + vegas->dwnd = dwnd; + + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + tp->snd_cwnd = vegas->cwnd + vegas->dwnd; +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct compound *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure:; + } +} + +static struct tcp_congestion_ops tcp_compound = { + .init = tcp_compound_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_compound_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_compound_rtt_calc, + .set_state = tcp_compound_state, + .cwnd_event = tcp_compound_cwnd_event, + .get_info = tcp_compound_get_info, + + .owner = THIS_MODULE, + .name = "compound", +}; + +static int __init tcp_compound_register(void) +{ + BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_compound); + return 0; +} + +static void __exit tcp_compound_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_compound); +} + +module_init(tcp_compound_register); +module_exit(tcp_compound_unregister); + +MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Compound"); -- cgit v1.1 From a4ed25849532728effaa0665c92e08e029e41407 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:29:39 -0700 Subject: [TCP]: TCP Compound quad root function The original code did a 64 bit divide directly, which won't work on 32 bit platforms. Rather than doing a 64 bit square root twice, just implement a 4th root function in one pass using Newton's method. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_compound.c | 90 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c index 8c1ebfb..ec68cb8 100644 --- a/net/ipv4/tcp_compound.c +++ b/net/ipv4/tcp_compound.c @@ -52,8 +52,6 @@ #define TCP_COMPOUND_ALPHA 3U #define TCP_COMPOUND_BETA 1U -#define TCP_COMPOUND_KAPPA_POW 3 -#define TCP_COMPOUND_KAPPA_NSQRT 2 #define TCP_COMPOUND_GAMMA 30 #define TCP_COMPOUND_ZETA 1 @@ -156,6 +154,58 @@ static void tcp_compound_state(struct sock *sk, u8 ca_state) vegas_disable(sk); } + +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u64 div64_64(u64 dividend, u64 divisor) +{ + u32 d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (u32) dividend / d; + + return dividend; +} + +/* calculate the quartic root of "a" using Newton-Raphson */ +static u32 qroot(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * qrt(x) = exp(log(x) / 4) + */ + x = 1u << (fls64(a) >> 2); + + /* + * Iteration based on: + * 3 + * x = ( 3 * x + a / x ) / 4 + * k+1 k k + */ + do { + u64 x3 = x; + + x1 = x; + x3 *= x; + x3 *= x; + + x = (3 * x + (u32) div64_64(a, x3)) / 4; + } while (abs(x1 - x) > 1); + + return x; +} + + /* * If the connection is idle and we are restarting, * then we don't want to do any Vegas calculations @@ -304,29 +354,21 @@ static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, dwnd = vegas->dwnd; if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { - u32 i, j, x, x2; u64 v; - - v = 1; - - for (i = 0; i < TCP_COMPOUND_KAPPA_POW; i++) - v *= old_wnd; - - for (i = 0; i < TCP_COMPOUND_KAPPA_NSQRT; i++) { - x = 1; - for (j = 0; j < 200; j++) { - x2 = (x + v / x) / 2; - - if (x2 == x || !x2) - break; - - x = x2; - } - v = x; - } - - x = (u32) v >> TCP_COMPOUND_ALPHA; - + u32 x; + + /* + * The TCP Compound paper describes the choice + * of "k" determines the agressiveness, + * ie. slope of the response function. + * + * For same value as HSTCP would be 0.8 + * but for computaional reasons, both the + * original authors and this implementation + * use 0.75. + */ + v = old_wnd; + x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA; if (x > 1) dwnd = x - 1; else -- cgit v1.1 From 72dc5b9225c53310c010b68a70ea97c8c8e24bdf Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:08 -0700 Subject: [TCP]: Minimum congestion window consolidation. Many of the TCP congestion methods all just use ssthresh as the minimum congestion window on decrease. Rather than duplicating the code, just have that be the default if that handle in the ops structure is not set. Minor behaviour change to TCP compound. It probably wants to use this (ssthresh) as lower bound, rather than ssthresh/2 because the latter causes undershoot on loss. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 7 ------- net/ipv4/tcp_compound.c | 1 - net/ipv4/tcp_cong.c | 6 +++--- net/ipv4/tcp_cubic.c | 6 ------ net/ipv4/tcp_htcp.c | 9 --------- net/ipv4/tcp_input.c | 13 +++++++++++-- net/ipv4/tcp_veno.c | 7 ------- net/ipv4/tcp_westwood.c | 18 +++++++----------- 8 files changed, 21 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 035f209..b2d9021 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c index ec68cb8..bc54f7e 100644 --- a/net/ipv4/tcp_compound.c +++ b/net/ipv4/tcp_compound.c @@ -419,7 +419,6 @@ static struct tcp_congestion_ops tcp_compound = { .init = tcp_compound_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_compound_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .rtt_sample = tcp_compound_rtt_calc, .set_state = tcp_compound_state, .cwnd_event = tcp_compound_cwnd_event, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 91c2f41..857eefc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + if (!ca->ssthresh || !ca->cong_avoid) { printk(KERN_ERR "TCP %s does not implement required ops\n", ca->name); return -EINVAL; @@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct sock *sk) +/* Lower bound on congestion window with halving. */ +u32 tcp_reno_min_cwnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 31a4986..78b7a6b 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - return tcp_sk(sk)->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1b2ff53..3d92c18 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, } } -/* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - - static void htcp_init(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); @@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state) static struct tcp_congestion_ops htcp = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, - .min_cwnd = htcp_min_cwnd, .cong_avoid = htcp_cong_avoid, .set_state = htcp_state, .undo_cwnd = htcp_cwnd_undo, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6d16788..e08245b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1689,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +/* Lower bound on congestion window is slow start threshold + * unless congestion avoidance choice decides to overide it. + */ +static inline u32 tcp_cwnd_min(const struct sock *sk) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + + return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; +} + /* Decrease cwnd each second ack. */ static void tcp_cwnd_down(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 1091671..11b42a7 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -199,17 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static u32 tcp_veno_min_cwnd(struct sock * sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static struct tcp_congestion_ops tcp_veno = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, - .min_cwnd = tcp_veno_min_cwnd, .rtt_sample = tcp_veno_rtt_calc, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 0c340c3..29eb258 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -162,12 +162,6 @@ static inline u32 westwood_acked_count(struct sock *sk) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct westwood *w = inet_csk_ca(sk); - return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); -} /* * TCP Westwood @@ -175,9 +169,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ -static u32 tcp_westwood_cwnd_min(struct sock *sk) +static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { - return westwood_bw_rttmin(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) @@ -191,11 +187,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_SLOW_ACK: @@ -235,7 +231,7 @@ static struct tcp_congestion_ops tcp_westwood = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_cwnd_min, + .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, -- cgit v1.1 From a42e9d6ce89cfd19aee9f990b7231ce697f0d00f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:32 -0700 Subject: [TCP]: TCP Probe congestion window tracing This adds a new module for tracking TCP state variables non-intrusively using kprobes. It has a simple /proc interface that outputs one line for each packet received. A sample usage is to collect congestion window and ssthresh over time graphs. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/Kconfig | 15 +++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_probe.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 net/ipv4/tcp_probe.c (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index 4193cdc..ff0db80 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -215,6 +215,21 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. +config NET_TCPPROBE + tristate "TCP connection probing" + depends on INET && EXPERIMENTAL && PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to TCP connection + state in response to incoming patckets. It is used for debugging + TCP congestion avoidance modules. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use the packet generator can be found + at http://linux-net.osdl.org/index.php/TcpProbe + + To compile this code as a module, choose M here: the + module will be called tcp_probe. + endmenu endmenu diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f30faef..38b8039 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 0000000..be94d52 --- /dev/null +++ b/net/ipv4/tcp_probe.c @@ -0,0 +1,179 @@ +/* + * tcpprobe - Observe the TCP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Stephen Hemminger "); +MODULE_DESCRIPTION("TCP cwnd snooper"); +MODULE_LICENSE("GPL"); + +static int port = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static int bufsize = 64*1024; +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +static const char procname[] = "tcpprobe"; + +struct { + struct kfifo *fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timeval tstart; +} tcpw; + +static void printl(const char *fmt, ...) +{ + va_list args; + int len; + struct timeval now; + char tbuf[256]; + + va_start(args, fmt); + do_gettimeofday(&now); + + now.tv_sec -= tcpw.tstart.tv_sec; + now.tv_usec -= tcpw.tstart.tv_usec; + if (now.tv_usec < 0) { + --now.tv_sec; + now.tv_usec += 1000000; + } + + len = sprintf(tbuf, "%lu.%06lu ", now.tv_sec, now.tv_usec); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + + kfifo_put(tcpw.fifo, tbuf, len); + wake_up(&tcpw.wait); +} + +static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), + size, tp->snd_nxt, tp->snd_una, + tp->snd_cwnd, tcp_current_ssthresh(sk), + tp->snd_wnd); + } + + jprobe_return(); + return 0; +} + +static struct jprobe tcp_send_probe = { + .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, + .entry = (kprobe_opcode_t *) &jtcp_sendmsg, +}; + + +static int tcpprobe_open(struct inode * inode, struct file * file) +{ + kfifo_reset(tcpw.fifo); + do_gettimeofday(&tcpw.tstart); + return 0; +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt; + unsigned char *tbuf; + + if (!buf || len < 0) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(tcpw.wait, + __kfifo_len(tcpw.fifo) != 0); + if (error) + return error; + + cnt = kfifo_get(tcpw.fifo, tbuf, len); + error = copy_to_user(buf, tbuf, cnt); + + vfree(tbuf); + + return error ? error : cnt; +} + +static struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&tcpw.wait); + spin_lock_init(&tcpw.lock); + tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock); + + if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) + goto err0; + + ret = register_jprobe(&tcp_send_probe); + if (ret) + goto err1; + + pr_info("TCP watch registered (port=%d)\n", port); + return 0; + err1: + proc_net_remove(procname); + err0: + kfifo_free(tcpw.fifo); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + kfifo_free(tcpw.fifo); + proc_net_remove(procname); + unregister_jprobe(&tcp_send_probe); + +} +module_exit(tcpprobe_exit); -- cgit v1.1 From 738980ffa658c86bd494ebb242ce8e44aff16a9e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:56 -0700 Subject: [TCP]: Limited slow start for Highspeed TCP Implementation of RFC3742 limited slow start. Added as part of the TCP highspeed congestion control module. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_highspeed.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index ba7c63c..1120245 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,6 +98,10 @@ struct hstcp { u32 ai; }; +static int max_ssthresh = 100; +module_param(max_ssthresh, int, 0644); +MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); + static void hstcp_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); - else { + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* RFC3742: limited slow start + * the window is increased by 1/K MSS for each arriving ACK, + * for K = int(cwnd/(0.5 max_ssthresh)) + */ + if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { + u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); + if (++tp->snd_cwnd_cnt >= k) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } else { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && -- cgit v1.1 From 70df2311ee3fc607e7511873d7dade5bd17d593d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2006 17:59:20 -0700 Subject: [TCP]: Fix compile warning in tcp_probe.c The suseconds_t et al. are not necessarily any particular type on every platform, so cast to unsigned long so that we can use one printf format string and avoid warnings across the board Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index be94d52..d7d517a 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -68,7 +68,9 @@ static void printl(const char *fmt, ...) now.tv_usec += 1000000; } - len = sprintf(tbuf, "%lu.%06lu ", now.tv_sec, now.tv_usec); + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_usec); len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); -- cgit v1.1 From 338fcf9886df9ad2873772197a73a57818973316 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 21:04:39 -0700 Subject: [IPV4] igmp: Fixup struct ip_mc_list::multiaddr type All users except two expect 32-bit big-endian value. One is of ->multiaddr = ->multiaddr variety. And last one is "%08lX". Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d512239..ab680c8 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) } seq_printf(seq, - "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_to_clock_t(im->timer.expires-jiffies) : 0, -- cgit v1.1 From 6d7416535097ed0943bdae8e69c14ba43061cab1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 21:06:41 -0700 Subject: [IPV4]: Right prototype of __raw_v4_lookup() All users pass 32-bit values as addresses and internally they're compared with 32-bit entities. So, change "laddr" and "raddr" types to __be32. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/raw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc25624..bd221ec 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk) } struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, + __be32 raddr, __be32 laddr, int dif) { struct hlist_node *node; -- cgit v1.1 From f86502bfc177f69bbabbedb78ebf710579ae0e54 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2006 21:19:24 -0700 Subject: [IPV4] icmp: Kill local 'ip' arg in icmp_redirect(). It is typed wrong, and it's only assigned and used once. So just pass in iph->daddr directly which fixes both problems. Based upon a patch by Alexey Dobriyan. Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2a04559..0179001 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -730,7 +730,6 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { struct iphdr *iph; - unsigned long ip; if (skb->len < sizeof(struct iphdr)) goto out_err; @@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb) goto out; iph = (struct iphdr *)skb->data; - ip = iph->daddr; switch (skb->h.icmph->code & 7) { case ICMP_REDIR_NET: @@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb) */ case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, + ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, + skb->h.icmph->un.gateway, iph->saddr, skb->dev); break; } -- cgit v1.1 From c8c05a8eec6f1258f6d5cb71a44ee5dc1e989b63 Mon Sep 17 00:00:00 2001 From: Catherine Zhang Date: Thu, 8 Jun 2006 23:39:49 -0700 Subject: [LSM-IPsec]: SELinux Authorize This patch contains a fix for the previous patch that adds security contexts to IPsec policies and security associations. In the previous patch, no authorization (besides the check for write permissions to SAD and SPD) is required to delete IPsec policies and security assocations with security contexts. Thus a user authorized to change SAD and SPD can bypass the IPsec policy authorization by simply deleteing policies with security contexts. To fix this security hole, an additional authorization check is added for removing security policies and security associations with security contexts. Note that if no security context is supplied on add or present on policy to be deleted, the SELinux module allows the change unconditionally. The hook is called on deletion when no context is present, which we may want to change. At present, I left it up to the module. LSM changes: The patch adds two new LSM hooks: xfrm_policy_delete and xfrm_state_delete. The new hooks are necessary to authorize deletion of IPsec policies that have security contexts. The existing hooks xfrm_policy_free and xfrm_state_free lack the context to do the authorization, so I decided to split authorization of deletion and memory management of security data, as is typical in the LSM interface. Use: The new delete hooks are checked when xfrm_policy or xfrm_state are deleted by either the xfrm_user interface (xfrm_get_policy, xfrm_del_sa) or the pfkey interface (pfkey_spddelete, pfkey_delete). SELinux changes: The new policy_delete and state_delete functions are added. Signed-off-by: Catherine Zhang Signed-off-by: Trent Jaeger Acked-by: James Morris Signed-off-by: David S. Miller --- net/key/af_key.c | 17 +++++++++++------ net/xfrm/xfrm_user.c | 19 ++++++++++++------- 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 8595822..d5e2121 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h if (x == NULL) return -ESRCH; + if ((err = security_xfrm_state_delete(x))) + goto out; + if (xfrm_state_kern(x)) { - xfrm_state_put(x); - return -EPERM; + err = -EPERM; + goto out; } err = xfrm_state_delete(x); - if (err < 0) { - xfrm_state_put(x); - return err; - } + if (err < 0) + goto out; c.seq = hdr->sadb_msg_seq; c.pid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); +out: xfrm_state_put(x); return err; @@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg err = 0; + if ((err = security_xfrm_policy_delete(xp))) + goto out; c.seq = hdr->sadb_msg_seq; c.pid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); +out: xfrm_pol_put(xp); return err; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 81d1005..a3733d2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) if (x == NULL) return -ESRCH; + if (err = security_xfrm_state_delete(x)) + goto out; + if (xfrm_state_kern(x)) { - xfrm_state_put(x); - return -EPERM; + err = -EPERM; + goto out; } err = xfrm_state_delete(x); - if (err < 0) { - xfrm_state_put(x); - return err; - } + if (err < 0) + goto out; c.seq = nlh->nlmsg_seq; c.pid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; km_state_notify(x, &c); - xfrm_state_put(x); +out: + xfrm_state_put(x); return err; } @@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr MSG_DONTWAIT); } } else { + if (err = security_xfrm_policy_delete(xp)) + goto out; c.data.byid = p->index; c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; @@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr xfrm_pol_put(xp); +out: return err; } -- cgit v1.1 From 9dadaa19cb11a8db38072a92a3f95deab7a797fb Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 8 Jun 2006 23:42:09 -0700 Subject: [NET]: NET_TCPPROBE Kconfig fix Just spotted this typo in a new option. Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index ff0db80..ccadc8e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -220,7 +220,7 @@ config NET_TCPPROBE depends on INET && EXPERIMENTAL && PROC_FS && KPROBES ---help--- This module allows for capturing the changes to TCP connection - state in response to incoming patckets. It is used for debugging + state in response to incoming packets. It is used for debugging TCP congestion avoidance modules. If you don't understand what was just said, you don't need it: say N. -- cgit v1.1 From 6f68dc37759b1d6ff3b4d4a9d097605a09f8f043 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 8 Jun 2006 23:58:52 -0700 Subject: [NET]: Fix warnings after LSM-IPSEC changes. Assignment used as truth value in xfrm_del_sa() and xfrm_get_policy(). Wrong argument type declared for security_xfrm_state_delete() when SELINUX is disabled. Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a3733d2..c21dc26 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -427,7 +427,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) if (x == NULL) return -ESRCH; - if (err = security_xfrm_state_delete(x)) + if ((err = security_xfrm_state_delete(x)) != 0) goto out; if (xfrm_state_kern(x)) { @@ -1057,7 +1057,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr MSG_DONTWAIT); } } else { - if (err = security_xfrm_policy_delete(xp)) + if ((err = security_xfrm_policy_delete(xp)) != 0) goto out; c.data.byid = p->index; c.event = nlh->nlmsg_type; -- cgit v1.1 From 984bc16cc92ea3c247bf34ad667cfb95331b9d3c Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:29:17 -0700 Subject: [SECMARK]: Add secmark support to core networking. Add a secmark field to the skbuff structure, to allow security subsystems to place security markings on network packets. This is similar to the nfmark field, except is intended for implementing security policy, rather than than networking policy. This patch was already acked in principle by Dave Miller. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/Kconfig | 7 +++++++ net/core/skbuff.c | 3 ++- net/ipv4/ip_output.c | 1 + net/ipv4/netfilter/ipt_REJECT.c | 1 + net/ipv6/ip6_output.c | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index ccadc8e..c6cec5a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -66,6 +66,13 @@ source "net/ipv6/Kconfig" endif # if INET +config NETWORK_SECMARK + bool "Security Marking" + help + This enables security marking of network packets, similar + to nfmark, but designated for security purposes. + If you are unsure how to answer this question, answer N. + menuconfig NETFILTER bool "Network packet filtering (replaces ipchains)" ---help--- diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb3770f..96cdcbe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); #endif - + skb_copy_secmark(n, skb); #endif C(truesize); atomic_set(&n->users, 1); @@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->tc_index = old->tc_index; #endif + skb_copy_secmark(new, old); atomic_set(&new->users, 1); skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cff9c3a..d4bb3fa 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } /* diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0bba3c2..431a3ce 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); nskb->nfmark = 0; + skb_init_secmark(nskb); tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 416f6e4..d29620f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -459,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) -- cgit v1.1 From 5e6874cdb8de94cd3c15d853a8ef9c6f4c305055 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:30:57 -0700 Subject: [SECMARK]: Add xtables SECMARK target Add a SECMARK target to xtables, allowing the admin to apply security marks to packets via both iptables and ip6tables. The target currently handles SELinux security marking, but can be extended for other purposes as needed. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 9 +++ net/netfilter/Makefile | 1 + net/netfilter/xt_SECMARK.c | 156 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 net/netfilter/xt_SECMARK.c (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 85a7e17..10eccdd 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -174,6 +174,15 @@ config NETFILTER_XT_TARGET_NOTRACK If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NETFILTER_XT_TARGET_SECMARK + tristate '"SECMARK" target support' + depends on NETFILTER_XTABLES && NETWORK_SECMARK + help + The SECMARK target allows security marking of network + packets, for use with security subsystems. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index df1da25f..3645de0 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c new file mode 100644 index 0000000..c2ce9c4 --- /dev/null +++ b/net/netfilter/xt_SECMARK.c @@ -0,0 +1,156 @@ +/* + * Module for modifying the secmark field of the skb, for use by + * security subsystems. + * + * Based on the nfmark match by: + * (C) 1999-2001 Marc Boucher + * + * (C) 2006 Red Hat, Inc., James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("ip[6]tables SECMARK modification module"); +MODULE_ALIAS("ipt_SECMARK"); +MODULE_ALIAS("ip6t_SECMARK"); + +#define PFX "SECMARK: " + +static u8 mode; + +static unsigned int target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, + const void *targinfo, void *userinfo) +{ + u32 secmark = 0; + const struct xt_secmark_target_info *info = targinfo; + + BUG_ON(info->mode != mode); + + switch (mode) { + case SECMARK_MODE_SEL: + secmark = info->u.sel.selsid; + break; + + default: + BUG(); + } + + if ((*pskb)->secmark != secmark) + (*pskb)->secmark = secmark; + + return XT_CONTINUE; +} + +static int checkentry_selinux(struct xt_secmark_target_info *info) +{ + int err; + struct xt_secmark_target_selinux_info *sel = &info->u.sel; + + err = selinux_string_to_sid(sel->selctx, &sel->selsid); + if (err) { + if (err == -EINVAL) + printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n", + sel->selctx); + return 0; + } + + if (!sel->selsid) { + printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n", + sel->selctx); + return 0; + } + + err = selinux_relabel_packet_permission(sel->selsid); + if (err) { + printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); + return 0; + } + + return 1; +} + +static int checkentry(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + struct xt_secmark_target_info *info = targinfo; + + if (mode && mode != info->mode) { + printk(KERN_INFO PFX "mode already set to %hu cannot mix with " + "rules for mode %hu\n", mode, info->mode); + return 0; + } + + switch (info->mode) { + case SECMARK_MODE_SEL: + if (!checkentry_selinux(info)) + return 0; + break; + + default: + printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); + return 0; + } + + if (!mode) + mode = info->mode; + return 1; +} + +static struct xt_target ipt_secmark_reg = { + .name = "SECMARK", + .target = target, + .targetsize = sizeof(struct xt_secmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET, + .revision = 0, +}; + +static struct xt_target ip6t_secmark_reg = { + .name = "SECMARK", + .target = target, + .targetsize = sizeof(struct xt_secmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET6, + .revision = 0, +}; + +static int __init xt_secmark_init(void) +{ + int err; + + err = xt_register_target(&ipt_secmark_reg); + if (err) + return err; + + err = xt_register_target(&ip6t_secmark_reg); + if (err) + xt_unregister_target(&ipt_secmark_reg); + + return err; +} + +static void __exit xt_secmark_fini(void) +{ + xt_unregister_target(&ip6t_secmark_reg); + xt_unregister_target(&ipt_secmark_reg); +} + +module_init(xt_secmark_init); +module_exit(xt_secmark_fini); -- cgit v1.1 From 7c9728c393dceb724d66d696cfabce82151a78e5 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:31:46 -0700 Subject: [SECMARK]: Add secmark support to conntrack Add a secmark field to IP and NF conntracks, so that security markings on packets can be copied to their associated connections, and also copied back to packets as required. This is similar to the network mark field currently used with conntrack, although it is intended for enforcement of security policy rather than network policy. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 12 ++++++++++++ net/ipv4/netfilter/ip_conntrack_core.c | 3 +++ net/ipv4/netfilter/ip_conntrack_standalone.c | 5 +++++ net/netfilter/Kconfig | 12 ++++++++++++ net/netfilter/nf_conntrack_core.c | 3 +++ net/netfilter/nf_conntrack_standalone.c | 5 +++++ 6 files changed, 40 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ff4b118..e1d7f5f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on IP_NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config IP_NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && IP_NF_CONNTRACK diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 4fe9e69..7e4cf9a 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, /* this is ugly, but there is no other place where to put it */ conntrack->nat.masq_index = exp->master->nat.masq_index; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; +#endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 6cb9b98..88445aa 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 10eccdd..023f81e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && NF_CONNTRACK diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bc2bd4c..cd299f4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -990,6 +990,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, #ifdef CONFIG_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; +#endif nf_conntrack_get(&conntrack->master->ct_general); NF_CT_STAT_INC(expect_new); } else diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e01d20d..e34c574 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; -- cgit v1.1 From 100468e9c05c10fb6872751c1af523b996d6afa9 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:32:39 -0700 Subject: [SECMARK]: Add CONNSECMARK xtables target Add a new xtables target, CONNSECMARK, which is used to specify rules for copying security marks from packets to connections, and for copyying security marks back from connections to packets. This is similar to the CONNMARK target, but is more limited in scope in that it only allows copying of security marks to and from packets, as this is all it needs to do. A typical scenario would be to apply a security mark to a 'new' packet with SECMARK, then copy that to its conntrack via CONNMARK, and then restore the security mark from the connection to established and related packets on that connection. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 11 +++ net/netfilter/Makefile | 1 + net/netfilter/xt_CONNSECMARK.c | 155 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 net/netfilter/xt_CONNSECMARK.c (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 023f81e..b1622b7 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -195,6 +195,17 @@ config NETFILTER_XT_TARGET_SECMARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_CONNSECMARK + tristate '"CONNSECMARK" target support' + depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK) + help + The CONNSECMARK target copies security markings from packets + to connections, and restores security markings from connections + to packets (if the packets are not already marked). This would + normally be used in conjunction with the SECMARK target. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3645de0..6fa4b75 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c new file mode 100644 index 0000000..8c011e0 --- /dev/null +++ b/net/netfilter/xt_CONNSECMARK.c @@ -0,0 +1,155 @@ +/* + * This module is used to copy security markings from packets + * to connections, and restore security markings from connections + * back to packets. This would normally be performed in conjunction + * with the SECMARK target and state match. + * + * Based somewhat on CONNMARK: + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * + * (C) 2006 Red Hat, Inc., James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include + +#define PFX "CONNSECMARK: " + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module"); +MODULE_ALIAS("ipt_CONNSECMARK"); +MODULE_ALIAS("ip6t_CONNSECMARK"); + +/* + * If the packet has a security mark and the connection does not, copy + * the security mark from the packet to the connection. + */ +static void secmark_save(struct sk_buff *skb) +{ + if (skb->secmark) { + u32 *connsecmark; + enum ip_conntrack_info ctinfo; + + connsecmark = nf_ct_get_secmark(skb, &ctinfo); + if (connsecmark && !*connsecmark) + if (*connsecmark != skb->secmark) + *connsecmark = skb->secmark; + } +} + +/* + * If packet has no security mark, and the connection does, restore the + * security mark from the connection to the packet. + */ +static void secmark_restore(struct sk_buff *skb) +{ + if (!skb->secmark) { + u32 *connsecmark; + enum ip_conntrack_info ctinfo; + + connsecmark = nf_ct_get_secmark(skb, &ctinfo); + if (connsecmark && *connsecmark) + if (skb->secmark != *connsecmark) + skb->secmark = *connsecmark; + } +} + +static unsigned int target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, + const void *targinfo, void *userinfo) +{ + struct sk_buff *skb = *pskb; + const struct xt_connsecmark_target_info *info = targinfo; + + switch (info->mode) { + case CONNSECMARK_SAVE: + secmark_save(skb); + break; + + case CONNSECMARK_RESTORE: + secmark_restore(skb); + break; + + default: + BUG(); + } + + return XT_CONTINUE; +} + +static int checkentry(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + struct xt_connsecmark_target_info *info = targinfo; + + switch (info->mode) { + case CONNSECMARK_SAVE: + case CONNSECMARK_RESTORE: + break; + + default: + printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); + return 0; + } + + return 1; +} + +static struct xt_target ipt_connsecmark_reg = { + .name = "CONNSECMARK", + .target = target, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET, + .revision = 0, +}; + +static struct xt_target ip6t_connsecmark_reg = { + .name = "CONNSECMARK", + .target = target, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET6, + .revision = 0, +}; + +static int __init xt_connsecmark_init(void) +{ + int err; + + need_conntrack(); + + err = xt_register_target(&ipt_connsecmark_reg); + if (err) + return err; + + err = xt_register_target(&ip6t_connsecmark_reg); + if (err) + xt_unregister_target(&ipt_connsecmark_reg); + + return err; +} + +static void __exit xt_connsecmark_fini(void) +{ + xt_unregister_target(&ip6t_connsecmark_reg); + xt_unregister_target(&ipt_connsecmark_reg); +} + +module_init(xt_connsecmark_init); +module_exit(xt_connsecmark_fini); -- cgit v1.1 From a0e889bb1bdc083dbbdb02cce9698765847b841f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:17:41 -0700 Subject: [NETFILTER]: recent match: fix "sleeping function called from invalid context" create_proc_entry must not be called with locks held. Use a mutex instead to protect data only changed in user context. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 9686c4d..9b09e48 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -69,6 +69,7 @@ struct recent_table { static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_dir; @@ -249,7 +250,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return 0; - spin_lock_bh(&recent_lock); + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (t != NULL) { t->refcnt++; @@ -258,7 +259,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, } t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, - GFP_ATOMIC); + GFP_KERNEL); if (t == NULL) goto out; strcpy(t->name, info->name); @@ -274,10 +275,12 @@ ipt_recent_checkentry(const char *tablename, const void *ip, t->proc->proc_fops = &recent_fops; t->proc->data = t; #endif + spin_lock_bh(&recent_lock); list_add_tail(&t->list, &tables); + spin_unlock_bh(&recent_lock); ret = 1; out: - spin_unlock_bh(&recent_lock); + mutex_unlock(&recent_mutex); return ret; } @@ -288,17 +291,19 @@ ipt_recent_destroy(const struct xt_match *match, void *matchinfo, const struct ipt_recent_info *info = matchinfo; struct recent_table *t; - spin_lock_bh(&recent_lock); + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); list_del(&t->list); + spin_unlock_bh(&recent_lock); recent_table_flush(t); #ifdef CONFIG_PROC_FS remove_proc_entry(t->name, proc_dir); #endif kfree(t); } - spin_unlock_bh(&recent_lock); + mutex_unlock(&recent_mutex); } #ifdef CONFIG_PROC_FS -- cgit v1.1 From 2b2283d0302d520f08ded41c2ca17886dfbb865a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:18:17 -0700 Subject: [NETFILTER]: recent match: missing refcnt initialization Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 9b09e48..61a2139 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -262,6 +262,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, GFP_KERNEL); if (t == NULL) goto out; + t->refcnt = 1; strcpy(t->name, info->name); INIT_LIST_HEAD(&t->lru_list); for (i = 0; i < ip_list_hash_size; i++) -- cgit v1.1 From bf0857ea32addb6bc8b46383604b218b8ec09f19 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:18:47 -0700 Subject: [NETFILTER]: hashlimit match: fix random initialization hashlimit does: if (!ht->rnd) get_random_bytes(&ht->rnd, 4); ignoring that 0 is also a valid random number. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 85edfb7..92980ab 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -80,6 +80,7 @@ struct ipt_hashlimit_htable { /* used internally */ spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ + int rnd_initialized; struct timer_list timer; /* timer for gc */ atomic_t count; /* number entries in table */ @@ -134,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) /* initialize hash with random val at the time we allocate * the first hashtable entry */ - if (!ht->rnd) + if (!ht->rnd_initialized) { get_random_bytes(&ht->rnd, 4); + ht->rnd_initialized = 1; + } if (ht->cfg.max && atomic_read(&ht->count) >= ht->cfg.max) { @@ -214,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo) atomic_set(&hinfo->count, 0); atomic_set(&hinfo->use, 1); - hinfo->rnd = 0; + hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); if (!hinfo->pde) { -- cgit v1.1 From 932ff279a43ab7257942cddff2595acd541cc49b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 12:20:56 -0700 Subject: [NET]: Add netif_tx_lock Various drivers use xmit_lock internally to synchronise with their transmission routines. They do so without setting xmit_lock_owner. This is fine as long as netpoll is not in use. With netpoll it is possible for deadlocks to occur if xmit_lock_owner isn't set. This is because if a printk occurs while xmit_lock is held and xmit_lock_owner is not set can cause netpoll to attempt to take xmit_lock recursively. While it is possible to resolve this by getting netpoll to use trylock, it is suboptimal because netpoll's sole objective is to maximise the chance of getting the printk out on the wire. So delaying or dropping the message is to be avoided as much as possible. So the only alternative is to always set xmit_lock_owner. The following patch does this by introducing the netif_tx_lock family of functions that take care of setting/unsetting xmit_lock_owner. I renamed xmit_lock to _xmit_lock to indicate that it should not be used directly. I didn't provide irq versions of the netif_tx_lock functions since xmit_lock is meant to be a BH-disabling lock. This is pretty much a straight text substitution except for a small bug fix in winbond. It currently uses netif_stop_queue/spin_unlock_wait to stop transmission. This is unsafe as an IRQ can potentially wake up the queue. So it is safer to use netif_tx_disable. The hamradio bits used spin_lock_irq but it is unnecessary as xmit_lock must never be taken in an IRQ handler. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/atm/clip.c | 4 ++-- net/core/dev.c | 12 +++++------- net/core/dev_mcast.c | 28 ++++++++++++++-------------- net/core/netpoll.c | 9 +++------ net/core/pktgen.c | 4 ++-- net/sched/sch_generic.c | 28 ++++++++++++---------------- net/sched/sch_teql.c | 9 +++------ 7 files changed, 41 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 72d8529..f92f9c9 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } - spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */ + netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ entry->neigh->used = jiffies; for (walk = &entry->vccs; *walk; walk = &(*walk)->next) if (*walk == clip_vcc) { @@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc " "0x%p)\n", entry, clip_vcc); out: - spin_unlock_bh(&entry->neigh->dev->xmit_lock); + netif_tx_unlock_bh(entry->neigh->dev); } /* The neighbour entry n->lock is held. */ diff --git a/net/core/dev.c b/net/core/dev.c index 6bfa78c..1b09f1c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1282,15 +1282,13 @@ int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - spin_lock(&dev->xmit_lock); \ - dev->xmit_lock_owner = cpu; \ + netif_tx_lock(dev); \ } \ } #define HARD_TX_UNLOCK(dev) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - dev->xmit_lock_owner = -1; \ - spin_unlock(&dev->xmit_lock); \ + netif_tx_unlock(dev); \ } \ } @@ -1389,8 +1387,8 @@ int dev_queue_xmit(struct sk_buff *skb) /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... - Really, it is unlikely that xmit_lock protection is necessary here. - (f.e. loopback and IP tunnels are clean ignoring statistics + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics counters.) However, it is possible, that they rely on protection made by us here. @@ -2805,7 +2803,7 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); spin_lock_init(&dev->queue_lock); - spin_lock_init(&dev->xmit_lock); + spin_lock_init(&dev->_xmit_lock); dev->xmit_lock_owner = -1; #ifdef CONFIG_NET_CLS_ACT spin_lock_init(&dev->ingress_lock); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 05d60850..c57d887 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -62,7 +62,7 @@ * Device mc lists are changed by bh at least if IPv6 is enabled, * so that it must be bh protected. * - * We block accesses to device mc filters with dev->xmit_lock. + * We block accesses to device mc filters with netif_tx_lock. */ /* @@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev) void dev_mc_upload(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); __dev_mc_upload(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } /* @@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) int err = 0; struct dev_mc_list *dmi, **dmip; - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { /* @@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) */ __dev_mc_upload(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; } } err = -ENOENT; done: - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return err; } @@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC); - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && dmi->dmi_addrlen == alen) { @@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) } if ((dmi = dmi1) == NULL) { - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return -ENOMEM; } memcpy(dmi->dmi_addr, addr, alen); @@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) __dev_mc_upload(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; done: - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); kfree(dmi1); return err; } @@ -204,7 +204,7 @@ done: void dev_mc_discard(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); while (dev->mc_list != NULL) { struct dev_mc_list *tmp = dev->mc_list; @@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev) } dev->mc_count = 0; - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } #ifdef CONFIG_PROC_FS @@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) struct dev_mc_list *m; struct net_device *dev = v; - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); for (m = dev->mc_list; m; m = m->next) { int i; @@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) seq_putc(seq, '\n'); } - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e8e05ce..9cb7818 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) do { npinfo->tries--; - spin_lock(&np->dev->xmit_lock); - np->dev->xmit_lock_owner = smp_processor_id(); + netif_tx_lock(np->dev); /* * network drivers do not expect to be called if the queue is * stopped. */ if (netif_queue_stopped(np->dev)) { - np->dev->xmit_lock_owner = -1; - spin_unlock(&np->dev->xmit_lock); + netif_tx_unlock(np->dev); netpoll_poll(np); udelay(50); continue; } status = np->dev->hard_start_xmit(skb, np->dev); - np->dev->xmit_lock_owner = -1; - spin_unlock(&np->dev->xmit_lock); + netif_tx_unlock(np->dev); /* success */ if(!status) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index c23e9c0..67ed14d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) } } - spin_lock_bh(&odev->xmit_lock); + netif_tx_lock_bh(odev); if (!netif_queue_stopped(odev)) { atomic_inc(&(pkt_dev->skb->users)); @@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->next_tx_ns = 0; } - spin_unlock_bh(&odev->xmit_lock); + netif_tx_unlock_bh(odev); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 138ea92..b1e4c5e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev) dev->queue_lock serializes queue accesses for this device AND dev->qdisc pointer itself. - dev->xmit_lock serializes accesses to device driver. + netif_tx_lock serializes accesses to device driver. - dev->queue_lock and dev->xmit_lock are mutually exclusive, + dev->queue_lock and netif_tx_lock are mutually exclusive, if one is grabbed, another must be free. */ @@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev) * will be requeued. */ if (!nolock) { - if (!spin_trylock(&dev->xmit_lock)) { + if (!netif_tx_trylock(dev)) { collision: /* So, someone grabbed the driver. */ @@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev) __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } - /* Remember that the driver is grabbed by us. */ - dev->xmit_lock_owner = smp_processor_id(); } { @@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev) ret = dev->hard_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) { if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); return -1; @@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev) /* NETDEV_TX_BUSY - we need to requeue */ /* Release the driver */ if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); q = dev->qdisc; @@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg) { struct net_device *dev = (struct net_device *)arg; - spin_lock(&dev->xmit_lock); + netif_tx_lock(dev); if (dev->qdisc != &noop_qdisc) { if (netif_device_present(dev) && netif_running(dev) && @@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg) dev_hold(dev); } } - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); dev_put(dev); } @@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev) static void dev_watchdog_up(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); __netdev_watchdog_up(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } static void dev_watchdog_down(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) dev_put(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } void netif_carrier_on(struct net_device *dev) @@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev) while (test_bit(__LINK_STATE_SCHED, &dev->state)) yield(); - spin_unlock_wait(&dev->xmit_lock); + spin_unlock_wait(&dev->_xmit_lock); } void dev_init_scheduler(struct net_device *dev) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 79b8ef3..4c16ad5 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -302,20 +302,17 @@ restart: switch (teql_resolve(skb, skb_res, slave)) { case 0: - if (spin_trylock(&slave->xmit_lock)) { - slave->xmit_lock_owner = smp_processor_id(); + if (netif_tx_trylock(slave)) { if (!netif_queue_stopped(slave) && slave->hard_start_xmit(skb, slave) == 0) { - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + netif_tx_unlock(slave); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); master->stats.tx_packets++; master->stats.tx_bytes += len; return 0; } - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + netif_tx_unlock(slave); } if (netif_queue_stopped(dev)) busy = 1; -- cgit v1.1 From 364c6badde0dd62a0a38e5ed67f85d87d6665780 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:10:40 -0700 Subject: [NET]: Clean up skb_linearize The linearisation operation doesn't need to be super-optimised. So we can replace __skb_linearize with __pskb_pull_tail which does the same thing but is more general. Also, most users of skb_linearize end up testing whether the skb is linear or not so it helps to make skb_linearize do just that. Some callers of skb_linearize also use it to copy cloned data, so it's useful to have a new function skb_linearize_cow to copy the data if it's either non-linear or cloned. Last but not least, I've removed the gfp argument since nobody uses it anymore. If it's ever needed we can easily add it back. Misc bugs fixed by this patch: * via-velocity error handling (also, no SG => no frags) Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 63 ++------------------------------------------------ net/decnet/dn_nsp_in.c | 3 +-- net/decnet/dn_route.c | 3 +-- net/ipv4/ipcomp.c | 11 +++------ net/ipv6/ipcomp6.c | 11 +++------ 5 files changed, 10 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1b09f1c..91361bc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1222,64 +1222,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif -/* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) -{ - unsigned int size; - u8 *data; - long offset; - struct skb_shared_info *ninfo; - int headerlen = skb->data - skb->head; - int expand = (skb->tail + skb->data_len) - skb->end; - - if (skb_shared(skb)) - BUG(); - - if (expand <= 0) - expand = 0; - - size = skb->end - skb->head + expand; - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - return -ENOMEM; - - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) - BUG(); - - /* Set up shinfo */ - ninfo = (struct skb_shared_info*)(data + size); - atomic_set(&ninfo->dataref, 1); - ninfo->tso_size = skb_shinfo(skb)->tso_size; - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; - - /* Offset between the two in bytes */ - offset = data - skb->head; - - /* Free old data. */ - skb_release_data(skb); - - skb->head = data; - skb->end = data + size; - - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; - - skb->tail += skb->data_len; - skb->data_len = 0; - return 0; -} - #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1326,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* Fragmented skb is linearized if device does not support SG, @@ -1335,7 +1277,7 @@ int dev_queue_xmit(struct sk_buff *skb) */ if (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not support @@ -3473,7 +3415,6 @@ subsys_initcall(net_dev_init); EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(__skb_linearize); EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 547523b..a2ba9db 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -801,8 +801,7 @@ got_it: * We linearize everything except data segments here. */ if (cb->nsp_flags & ~0x60) { - if (unlikely(skb_is_nonlinear(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) + if (unlikely(skb_linearize(skb))) goto free_out; } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e172cf9..5abf705 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type padlen); if (flags & DN_RT_PKT_CNTL) { - if (unlikely(skb_is_nonlinear(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) + if (unlikely(skb_linearize(skb))) goto dump_it; switch(flags & DN_RT_CNTL_MSK) { diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 8e24358..3ed8b57 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -80,15 +80,12 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - int err = 0; + int err = -ENOMEM; struct iphdr *iph; struct ip_comp_hdr *ipch; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; @@ -158,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } err = ipcomp_compress(x, skb); iph = skb->nh.iph; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index cec3be5..f28cd37 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -65,7 +65,7 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { - int err = 0; + int err = -ENOMEM; struct ipv6hdr *iph; struct ipv6_comp_hdr *ipch; int plen, dlen; @@ -74,11 +74,8 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) struct crypto_tfm *tfm; int cpu; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; @@ -142,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } /* compression */ plen = skb->len - hdr_len; -- cgit v1.1 From b38dfee3d616ffadb58d4215e3ff9d1d7921031e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:13:01 -0700 Subject: [NET]: skb_trim audit I found a few more spots where pskb_trim_rcsum could be used but were not. This patch changes them to use it. Also, sk_filter can get paged skb data. Therefore we must use pskb_trim instead of skb_trim. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 14 +++----------- net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++------- 2 files changed, 6 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3da9264..3e41f9d 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) goto inhdr_error; - if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { - if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr))) - goto inhdr_error; - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) + goto inhdr_error; } if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) goto inhdr_error; @@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, if (skb->len < len || len < 4 * iph->ihl) goto inhdr_error; - if (skb->len > len) { - __pskb_trim(skb, len); - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + pskb_trim_rcsum(skb, len); nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3e31903..c32a029 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, DEBUGP("queue: message is too short.\n"); goto err; } - if (end-offset < skb->len) { - if (pskb_trim(skb, end - offset)) { - DEBUGP("Can't trim\n"); - goto err; - } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; + if (pskb_trim_rcsum(skb, end - offset)) { + DEBUGP("Can't trim\n"); + goto err; } /* Find out which fragments are in front and at the back of us -- cgit v1.1 From 3cc0e873986fe594d0e96d07259b11f755325cb2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:13:38 -0700 Subject: [NET]: Warn in __skb_trim if skb is paged It's better to warn and fail rather than rarely triggering BUG on paths that incorrectly call skb_trim/__skb_trim on a non-linear skb. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 96cdcbe..bb7210f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -801,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad) return nskb; } -/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. - * If realloc==0 and trimming is impossible without change of data, - * it is BUG(). +/* Trims skb to length len. It can change skb pointers. */ -int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +int ___pskb_trim(struct sk_buff *skb, unsigned int len) { int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)->nr_frags; @@ -816,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) int end = offset + skb_shinfo(skb)->frags[i].size; if (end > len) { if (skb_cloned(skb)) { - BUG_ON(!realloc); if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; } -- cgit v1.1 From f8d596211291a8d98efa47ae0261326218f310cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 10 Jun 2006 18:05:35 -0700 Subject: [IPX]: Endian bug in ipxrtr_route_packet() Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipx/ipx_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index a394c6f..bba3431 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -238,7 +238,7 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, } /* Apply checksum. Not allowed on 802.3 links. */ - if (sk->sk_no_check || intrfc->if_dlink_type == IPX_FRAME_8023) + if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023)) ipx->ipx_checksum = 0xFFFF; else ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); -- cgit v1.1 From bdeb04c6d9a957ae2a51c3033414467b82b2a736 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 11 Jun 2006 21:20:38 -0700 Subject: [NET]: net.ipv4.ip_autoconfig sysctl removal The sysctl net.ipv4.ip_autoconfig is a legacy value that is not used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6a6aa53..e7fb665 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -182,14 +182,6 @@ ctl_table ipv4_table[] = { .strategy = &ipv4_doint_and_flush_strategy, }, { - .ctl_name = NET_IPV4_AUTOCONFIG, - .procname = "ip_autoconfig", - .data = &ipv4_config.autoconfig, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_IPV4_NO_PMTU_DISC, .procname = "ip_no_pmtu_disc", .data = &ipv4_config.no_pmtu_disc, -- cgit v1.1 From f61e29018a30c738e1298e1b13be956aa17ee17b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 11 Jun 2006 23:01:02 -0700 Subject: [TCP] Westwood: fix first sample Need to update send sequence number tracking after first ack. Rework of patch from Luca De Cicco. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 29eb258..62a96b7 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -22,6 +22,7 @@ struct westwood { u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ + u8 first_ack; /* flag which infers that this is the first ack */ }; @@ -52,6 +53,7 @@ static void tcp_westwood_init(struct sock *sk) w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 1; } /* @@ -91,6 +93,15 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; + /* Initialise w->snd_una with the first acked sequence number in order + * to fix mismatch between tp->snd_una and w->snd_una for the first + * bandwidth sample + */ + if (w->first_ack) { + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 0; + } + /* * See if a RTT-window has passed. * Be careful since if RTT is less than @@ -180,7 +191,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - + switch(event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); -- cgit v1.1 From b7d7a9e3c900f0733bf2aabdd41e6dbc70eae94b Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:01:39 -0700 Subject: [TCP] Westwood: comment fixes Cleanup some comments and add more references Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 62a96b7..12a2cd9 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -1,7 +1,24 @@ /* - * TCP Westwood+ + * TCP Westwood+: end-to-end bandwidth estimation for TCP * - * Angelo Dell'Aera: TCP Westwood+ support + * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 + * + * Support at http://c3lab.poliba.it/index.php/Westwood + * Main references in literature: + * + * - Mascolo S, Casetti, M. Gerla et al. + * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 + * + * - A. Grieco, s. Mascolo + * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer + * Comm. Review, 2004 + * + * - A. Dell'Aera, L. Grieco, S. Mascolo. + * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : + * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 + * + * Westwood+ employs end-to-end bandwidth measurement to set cwnd and + * ssthresh after packet loss. The probing phase is as the original Reno. */ #include @@ -93,7 +110,7 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; - /* Initialise w->snd_una with the first acked sequence number in order + /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first * bandwidth sample */ @@ -191,7 +208,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - + switch(event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); -- cgit v1.1 From b3a92eabe5b67bd207a38ae13dd51f4e08c1f6f7 Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:01:59 -0700 Subject: [TCP] Westwood: bandwidth filter startup The bandwidth estimate filter is now initialized with the first sample in order to have better performances in the case of small file transfers. Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 12a2cd9..edf2ee1 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -82,10 +82,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b) return (((7 * a) + b) >> 3); } -static inline void westwood_filter(struct westwood *w, u32 delta) +static void westwood_filter(struct westwood *w, u32 delta) { - w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); - w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + /* If the filter is empty fill it with the first sample of bandwidth */ + if (w->bw_ns_est == 0 && w->bw_est == 0) { + w->bw_ns_est = w->bk / delta; + w->bw_est = w->bw_ns_est; + } else { + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + } } /* -- cgit v1.1 From bc726a71d2799f0f8b68a17f49d86aa030f64abc Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:02:19 -0700 Subject: [TCP] Westwood: reset RTT min after FRTO RTT_min is updated each time a timeout event occurs in order to cope with hard handovers in wireless scenarios such as UMTS. Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index edf2ee1..4247da1 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -40,6 +40,7 @@ struct westwood { u32 rtt; u32 rtt_min; /* minimum observed RTT */ u8 first_ack; /* flag which infers that this is the first ack */ + u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; @@ -67,6 +68,7 @@ static void tcp_westwood_init(struct sock *sk) w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; + w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; @@ -142,6 +144,16 @@ static void westwood_update_window(struct sock *sk) } } +static inline void update_rtt_min(struct westwood *w) +{ + if (w->reset_rtt_min) { + w->rtt_min = w->rtt; + w->reset_rtt_min = 0; + } else + w->rtt_min = min(w->rtt, w->rtt_min); +} + + /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when @@ -157,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk) w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); } /* @@ -226,12 +238,14 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) case CA_EVENT_FRTO: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + /* Update RTT_min when next ack arrives */ + w->reset_rtt_min = 1; break; case CA_EVENT_SLOW_ACK: westwood_update_window(sk); w->bk += westwood_acked_count(sk); - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); break; default: -- cgit v1.1 From 35089bb203f44e33b6bbb6c4de0b0708f9a48921 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Jun 2006 22:33:04 -0700 Subject: [TCP]: Add tcp_slow_start_after_idle sysctl. A lot of people have asked for a way to disable tcp_cwnd_restart(), and it seems reasonable to add a sysctl to do that. Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp_output.c | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e7fb665..ce4cd5f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -690,6 +690,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, #endif + { + .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, + .procname = "tcp_slow_start_after_idle", + .data = &sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f33c9dd..07bb5a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3; int sysctl_tcp_mtu_probing = 0; int sysctl_tcp_base_mss = 512; +/* By default, RFC2861 behavior. */ +int sysctl_tcp_slow_start_after_idle = 1; + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + if (sysctl_tcp_slow_start_after_idle && + (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; -- cgit v1.1 From 8648b3053bff39a7ee4c711d74268079c928a657 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 22:06:05 -0700 Subject: [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM The current stack treats NETIF_F_HW_CSUM and NETIF_F_NO_CSUM identically so we test for them in quite a few places. For the sake of brevity, I'm adding the macro NETIF_F_GEN_CSUM for these two. We also test the disjunct of NETIF_F_IP_CSUM and the other two in various places, for that purpose I've added NETIF_F_ALL_CSUM. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_if.c | 3 +-- net/core/dev.c | 6 ++---- net/core/ethtool.c | 6 ++---- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp.c | 10 ++-------- 5 files changed, 8 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f5d47bf..90c95f5 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -376,8 +376,7 @@ void br_features_recompute(struct net_bridge *br) checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (!(p->dev->features - & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))) + if (!(p->dev->features & NETIF_F_ALL_CSUM)) checksum = 0; features &= p->dev->features; } diff --git a/net/core/dev.c b/net/core/dev.c index 91361bc..ab39fe1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,7 +1284,7 @@ int dev_queue_xmit(struct sk_buff *skb) * checksumming for this protocol, complete checksumming here. */ if (skb->ip_summed == CHECKSUM_HW && - (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && + (!(dev->features & NETIF_F_GEN_CSUM) && (!(dev->features & NETIF_F_IP_CSUM) || skb->protocol != htons(ETH_P_IP)))) if (skb_checksum_help(skb, 0)) @@ -2789,9 +2789,7 @@ int register_netdevice(struct net_device *dev) /* Fix illegal SG+CSUM combinations. */ if ((dev->features & NETIF_F_SG) && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) { + !(dev->features & NETIF_F_ALL_CSUM)) { printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", dev->name); dev->features &= ~NETIF_F_SG; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e6f7610..3f269e4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev) u32 ethtool_op_get_tx_csum(struct net_device *dev) { - return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0; + return (dev->features & NETIF_F_ALL_CSUM) != 0; } int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) @@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) return -EFAULT; if (edata.data && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) + !(dev->features & NETIF_F_ALL_CSUM)) return -EINVAL; return __ethtool_set_sg(dev, edata.data); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d4bb3fa..8538aac 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -840,7 +840,7 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + rt->u.dst.dev->features & NETIF_F_ALL_CSUM && !exthdrlen) csummode = CHECKSUM_HW; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ff6ccda..74998f2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ssize_t res; struct sock *sk = sock->sk; -#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) - if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) return sock_no_sendpage(sock, page, offset, size, flags); -#undef TCP_ZC_CSUM_FLAGS - lock_sock(sk); TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); @@ -726,9 +722,7 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & - (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM)) + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); -- cgit v1.1 From 2c6cc0d8539f121c3c75aa3641c19b67e8723379 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 22:06:45 -0700 Subject: [BRIDGE]: Add support for NETIF_F_HW_CSUM devices As it is the bridge will only ever declare NETIF_F_IP_CSUM even if all its constituent devices support NETIF_F_HW_CSUM. This patch fixes this by supporting the first one out of NETIF_F_NO_CSUM, NETIF_F_HW_CSUM, and NETIF_F_IP_CSUM that is supported by all constituent devices. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_device.c | 6 +++--- net/bridge/br_if.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0c88a2a..2afdc7c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -145,9 +145,9 @@ static int br_set_tx_csum(struct net_device *dev, u32 data) struct net_bridge *br = netdev_priv(dev); if (data) - br->feature_mask |= NETIF_F_IP_CSUM; + br->feature_mask |= NETIF_F_NO_CSUM; else - br->feature_mask &= ~NETIF_F_IP_CSUM; + br->feature_mask &= ~NETIF_F_ALL_CSUM; br_features_recompute(br); return 0; @@ -185,5 +185,5 @@ void br_dev_setup(struct net_device *dev) dev->priv_flags = IFF_EBRIDGE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; + | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_NO_CSUM; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 90c95f5..fdec773 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -372,11 +372,17 @@ void br_features_recompute(struct net_bridge *br) struct net_bridge_port *p; unsigned long features, checksum; - features = br->feature_mask &~ NETIF_F_IP_CSUM; - checksum = br->feature_mask & NETIF_F_IP_CSUM; + checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0; + features = br->feature_mask & ~NETIF_F_ALL_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (!(p->dev->features & NETIF_F_ALL_CSUM)) + if (checksum & NETIF_F_NO_CSUM && + !(p->dev->features & NETIF_F_NO_CSUM)) + checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; + if (checksum & NETIF_F_HW_CSUM && + !(p->dev->features & NETIF_F_HW_CSUM)) + checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM; + if (!(p->dev->features & NETIF_F_IP_CSUM)) checksum = 0; features &= p->dev->features; } -- cgit v1.1 From b293acfd3133393a81bcd382eb71a210c9cf9526 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 17 Jun 2006 22:16:13 -0700 Subject: [IRDA]: Use put_unaligned() in irlmp_do_discovery(). irda_device_info->hints[] is byte aligned but is being accessed as a u16 Based upon a patch by Luke Yang . Signed-off-by: David S. Miller --- net/irda/irlmp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index c19e9ce..57ea160 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -44,6 +44,8 @@ #include #include +#include + static __u8 irlmp_find_free_slsap(void); static int irlmp_slsap_inuse(__u8 slsap_sel); @@ -840,6 +842,7 @@ void irlmp_do_expiry(void) void irlmp_do_discovery(int nslots) { struct lap_cb *lap; + __u16 *data_hintsp; /* Make sure the value is sane */ if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ @@ -849,7 +852,8 @@ void irlmp_do_discovery(int nslots) } /* Construct new discovery info to be used by IrLAP, */ - u16ho(irlmp->discovery_cmd.data.hints) = irlmp->hints.word; + data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints; + put_unaligned(irlmp->hints.word, data_hintsp); /* * Set character set for device name (we use ASCII), and -- cgit v1.1 From c5396a31b20991c856facbce18a2a56d1a14e8d0 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 17 Jun 2006 22:48:48 -0700 Subject: [IPV6]: Sum real space for RTAs. This patch fixes RTNLGRP_IPV6_IFINFO netlink notifications. Issue pointed out by Patrick McHardy . Signed-off-by: YOSHIFUJI Hideaki Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 445006e..c2c26fa 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2860,6 +2860,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); } +/* Maximum length of ifa_cacheinfo attributes */ +#define INET6_IFADDR_RTA_SPACE \ + RTA_SPACE(16) /* IFA_ADDRESS */ + \ + RTA_SPACE(sizeof(struct ifa_cacheinfo)) /* CACHEINFO */ + static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, u32 pid, u32 seq, int event, unsigned int flags) { @@ -3092,7 +3097,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) { struct sk_buff *skb; - int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + INET6_IFADDR_RTA_SPACE); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { @@ -3142,6 +3147,17 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, #endif } +/* Maximum length of ifinfomsg attributes */ +#define INET6_IFINFO_RTA_SPACE \ + RTA_SPACE(IFNAMSIZ) /* IFNAME */ + \ + RTA_SPACE(MAX_ADDR_LEN) /* ADDRESS */ + \ + RTA_SPACE(sizeof(u32)) /* MTU */ + \ + RTA_SPACE(sizeof(int)) /* LINK */ + \ + RTA_SPACE(0) /* PROTINFO */ + \ + RTA_SPACE(sizeof(u32)) /* FLAGS */ + \ + RTA_SPACE(sizeof(struct ifla_cacheinfo)) /* CACHEINFO */ + \ + RTA_SPACE(sizeof(__s32[DEVCONF_MAX])) /* CONF */ + static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, u32 pid, u32 seq, int event, unsigned int flags) { @@ -3235,8 +3251,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) void inet6_ifinfo_notify(int event, struct inet6_dev *idev) { struct sk_buff *skb; - /* 128 bytes ?? */ - int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128); + int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + INET6_IFINFO_RTA_SPACE); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { @@ -3252,6 +3267,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); } +/* Maximum length of prefix_cacheinfo attributes */ +#define INET6_PREFIX_RTA_SPACE \ + RTA_SPACE(sizeof(((struct prefix_info *)NULL)->prefix)) /* ADDRESS */ + \ + RTA_SPACE(sizeof(struct prefix_cacheinfo)) /* CACHEINFO */ + static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, struct prefix_info *pinfo, u32 pid, u32 seq, int event, unsigned int flags) @@ -3296,7 +3316,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo) { struct sk_buff *skb; - int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128); + int size = NLMSG_SPACE(sizeof(struct prefixmsg) + INET6_PREFIX_RTA_SPACE); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { -- cgit v1.1 From 402d68c43326d2f0e7e2e9a9013cd4c098d9b87c Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 17 Jun 2006 22:54:51 -0700 Subject: [SCTP]: Limit association max_retrans setting in setsockopt. When using ASSOCINFO socket option, we need to limit the number of maximum association retransmissions to be no greater than the sum of all the path retransmissions. This is specified in Section 7.1.2 of the SCTP socket API draft. However, we only do this if the association has multiple paths. If there is only one path, the protocol stack will use the assoc_max_retrans setting when trying to retransmit packets. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/socket.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 174d4d3..b41dcbb 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2530,8 +2530,32 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int o /* Set the values to the specific association */ if (asoc) { - if (assocparams.sasoc_asocmaxrxt != 0) + if (assocparams.sasoc_asocmaxrxt != 0) { + __u32 path_sum = 0; + int paths = 0; + struct list_head *pos; + struct sctp_transport *peer_addr; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + peer_addr = list_entry(pos, + struct sctp_transport, + transports); + path_sum += peer_addr->pathmaxrxt; + paths++; + } + + /* Only validate asocmaxrxt if we have more then + * one path/transport. We do this because path + * retransmissions are only counted when we have more + * then one path. + */ + if (paths > 1 && + assocparams.sasoc_asocmaxrxt > path_sum) + return -EINVAL; + asoc->max_retrans = assocparams.sasoc_asocmaxrxt; + } + if (assocparams.sasoc_cookie_life != 0) { asoc->cookie_life.tv_sec = assocparams.sasoc_cookie_life / 1000; -- cgit v1.1 From 5636bef7324f49e36f05ec8a5f6284e11b1bcca4 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 17 Jun 2006 22:55:35 -0700 Subject: [SCTP]: Reject sctp packets with broadcast addresses. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/input.c | 3 ++- net/sctp/ipv6.c | 6 ++++-- net/sctp/protocol.c | 8 +++++++- net/sctp/socket.c | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 1662f9c..70d6606 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -170,7 +170,8 @@ int sctp_rcv(struct sk_buff *skb) * IP broadcast addresses cannot be used in an SCTP transport * address." */ - if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL)) + if (!af->addr_valid(&src, NULL, skb) || + !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c20d282..8ef0807 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ -static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) +static int sctp_v6_addr_valid(union sctp_addr *addr, + struct sctp_sock *sp, + const struct sk_buff *skb) { int ret = ipv6_addr_type(&addr->v6.sin6_addr); @@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); - return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp); + return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); } /* Is this a non-unicast address */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 2088aa9..816c033 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr) * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ -static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) +static int sctp_v4_addr_valid(union sctp_addr *addr, + struct sctp_sock *sp, + const struct sk_buff *skb) { /* Is this a non-unicast address or a unusable SCTP address? */ if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) return 0; + /* Is this a broadcast address? */ + if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST) + return 0; + return 1; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b41dcbb..b811691c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, return -EINVAL; /* Is this a valid SCTP address? */ - if (!af->addr_valid(addr, sctp_sk(sk))) + if (!af->addr_valid(addr, sctp_sk(sk), NULL)) return -EINVAL; if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) -- cgit v1.1 From 4c9f5d5305a23851e67471b147e0d459a7166717 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 17 Jun 2006 22:56:08 -0700 Subject: [SCTP] Reset rtt_in_progress for the chunk when processing its sack. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index f148f95..e5faa35 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, if (!tchunk->tsn_gap_acked && !tchunk->resent && tchunk->rtt_in_progress) { + tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); -- cgit v1.1 From 503b55fd77d11381b1950d1651d3bc782c0cc2cd Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Sat, 17 Jun 2006 22:57:28 -0700 Subject: [SCTP]: Don't do CRC32C checksum over loopback. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/input.c | 3 ++- net/sctp/output.c | 48 +++++++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 70d6606..42b66e7 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -141,7 +141,8 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb->h.raw - skb->data); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (sctp_rcv_checksum(skb) < 0) + if ((skb->ip_summed != CHECKSUM_UNNECESSARY) && + (sctp_rcv_checksum(skb) < 0)) goto discard_it; skb_pull(skb, sizeof(struct sctphdr)); diff --git a/net/sctp/output.c b/net/sctp/output.c index 437cba7..cdc5a39 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -295,14 +295,14 @@ int sctp_packet_transmit(struct sctp_packet *packet) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - __u32 crc32; + __u32 crc32 = 0; struct sk_buff *nskb; struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ __u8 has_data = 0; - struct dst_entry *dst; + struct dst_entry *dst = tp->dst; SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); @@ -327,6 +327,19 @@ int sctp_packet_transmit(struct sctp_packet *packet) */ skb_set_owner_w(nskb, sk); + /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ + if (!dst || (dst->obsolete > 1)) { + dst_release(dst); + sctp_transport_route(tp, NULL, sctp_sk(sk)); + if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) { + sctp_assoc_sync_pmtu(asoc); + } + } + nskb->dst = dst_clone(tp->dst); + if (!nskb->dst) + goto no_route; + dst = nskb->dst; + /* Build the SCTP header. */ sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); sh->source = htons(packet->source_port); @@ -350,7 +363,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . */ - crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); + if (!(dst->dev->features & NETIF_F_NO_CSUM)) + crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); /** * 6.10 Bundling @@ -402,9 +416,14 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (padding) memset(skb_put(chunk->skb, padding), 0, padding); - crc32 = sctp_update_copy_cksum(skb_put(nskb, chunk->skb->len), - chunk->skb->data, - chunk->skb->len, crc32); + if (dst->dev->features & NETIF_F_NO_CSUM) + memcpy(skb_put(nskb, chunk->skb->len), + chunk->skb->data, chunk->skb->len); + else + crc32 = sctp_update_copy_cksum(skb_put(nskb, + chunk->skb->len), + chunk->skb->data, + chunk->skb->len, crc32); SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", "*** Chunk", chunk, @@ -427,7 +446,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) } /* Perform final transformation on checksum. */ - crc32 = sctp_end_cksum(crc32); + if (!(dst->dev->features & NETIF_F_NO_CSUM)) + crc32 = sctp_end_cksum(crc32); /* 3) Put the resultant value into the checksum field in the * common header, and leave the rest of the bits unchanged. @@ -477,20 +497,6 @@ int sctp_packet_transmit(struct sctp_packet *packet) } } - dst = tp->dst; - /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ - if (!dst || (dst->obsolete > 1)) { - dst_release(dst); - sctp_transport_route(tp, NULL, sctp_sk(sk)); - if (asoc->param_flags & SPP_PMTUD_ENABLE) { - sctp_assoc_sync_pmtu(asoc); - } - } - - nskb->dst = dst_clone(tp->dst); - if (!nskb->dst) - goto no_route; - SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); -- cgit v1.1 From d7c2c9e3977e4312d093ac092761798d4d47c9e0 Mon Sep 17 00:00:00 2001 From: Tsutomu Fujii Date: Sat, 17 Jun 2006 22:58:28 -0700 Subject: [SCTP]: Send only 1 window update SACK per message. Right now, every time we increase our rwnd by more then MTU bytes, we trigger a SACK. When processing large messages, this will generate a SACK for almost every other SCTP fragment. However since we are freeing the entire message at the same time, we might as well collapse the SACK generation to 1. Signed-off-by: Tsutomu Fujii Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/ulpevent.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index ba97f97..ee236784 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -51,6 +51,8 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); +static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); + /* Initialize an ULP event from an given skb. */ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) @@ -883,6 +885,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; + unsigned int len; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as @@ -892,7 +895,30 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) */ skb = sctp_event2skb(event); - sctp_assoc_rwnd_increase(event->asoc, skb_headlen(skb)); + len = skb->len; + + if (!skb->data_len) + goto done; + + /* Don't forget the fragments. */ + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* NOTE: skb_shinfos are recursive. Although IP returns + * skb's with only 1 level of fragments, SCTP reassembly can + * increase the levels. + */ + sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); + } + +done: + sctp_assoc_rwnd_increase(event->asoc, len); + sctp_ulpevent_release_owner(event); +} + +static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) +{ + struct sk_buff *skb, *frag; + + skb = sctp_event2skb(event); if (!skb->data_len) goto done; @@ -903,7 +929,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) * skb's with only 1 level of fragments, SCTP reassembly can * increase the levels. */ - sctp_ulpevent_release_data(sctp_skb2event(frag)); + sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); } done: -- cgit v1.1 From d5b9f4c083b0e3102f3101545279f623680cb3a0 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Sat, 17 Jun 2006 22:59:03 -0700 Subject: [SCTP]: Fix persistent slowdown in sctp when a gap ack consumes rx buffer. In the event that our entire receive buffer is full with a series of chunks that represent a single gap-ack, and then we accept a chunk (or chunks) that fill in the gap between the ctsn and the first gap, we renege chunks from the end of the buffer, which effectively does nothing but move our gap to the end of our received tsn stream. This does little but move our missing tsns down stream a little, and, if the sender is sending sufficiently large retransmit frames, the result is a perpetual slowdown which can never be recovered from, since the only chunk that can be accepted to allow progress in the tsn stream necessitates that a new gap be created to make room for it. This leads to a constant need for retransmits, and subsequent receiver stalls. The fix I've come up with is to deliver the frame without reneging if we have a full receive buffer and the receiving sockets sk_receive_queue is empty(indicating that the receive buffer is being blocked by a missing tsn). Signed-off-by: Neil Horman Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8bc2792..9e58144 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5293,10 +5293,18 @@ static int sctp_eat_data(const struct sctp_association *asoc, * seems a bit troublesome in that frag_point varies based on * PMTU. In cases, such as loopback, this might be a rather * large spill over. + * NOTE: If we have a full receive buffer here, we only renege if + * our receiver can still make progress without the tsn being + * received. We do this because in the event that the associations + * receive queue is empty we are filling a leading gap, and since + * reneging moves the gap to the end of the tsn stream, we are likely + * to stall again very shortly. Avoiding the renege when we fill a + * leading gap is a good heuristic for avoiding such steady state + * stalls. */ if (!asoc->rwnd || asoc->rwnd_over || (datalen > asoc->rwnd + asoc->frag_point) || - rcvbuf_over) { + (rcvbuf_over && (!skb_queue_len(&sk->sk_receive_queue)))) { /* If this is the next TSN, consider reneging to make * room. Note: Playing nice with a confused sender. A -- cgit v1.1 From 47552c4e555eefe381f3d45140b59a2ea4b16486 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 23:00:20 -0700 Subject: [ETHTOOL]: Fix UFO typo The function ethtool_get_ufo was referring to ETHTOOL_GTSO instead of ETHTOOL_GUFO. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3f269e4..33ce7ed 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -589,7 +589,7 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) { - struct ethtool_value edata = { ETHTOOL_GTSO }; + struct ethtool_value edata = { ETHTOOL_GUFO }; if (!dev->ethtool_ops->get_ufo) return -EOPNOTSUPP; @@ -598,6 +598,7 @@ static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) return -EFAULT; return 0; } + static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata; -- cgit v1.1