diff options
-rw-r--r-- | drivers/net/tg3.c | 15 | ||||
-rw-r--r-- | include/linux/sysctl.h | 3 | ||||
-rw-r--r-- | include/net/dn.h | 4 | ||||
-rw-r--r-- | net/core/dev.c | 3 | ||||
-rw-r--r-- | net/core/skbuff.c | 2 | ||||
-rw-r--r-- | net/decnet/af_decnet.c | 25 | ||||
-rw-r--r-- | net/decnet/sysctl_net_decnet.c | 33 | ||||
-rw-r--r-- | net/ipv4/netfilter/Kconfig | 8 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_core.c | 20 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_netlink.c | 12 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 233 | ||||
-rw-r--r-- | net/ipv4/tcp_vegas.c | 16 | ||||
-rw-r--r-- | net/ipv6/esp6.c | 2 | ||||
-rw-r--r-- | net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 12 | ||||
-rw-r--r-- | net/netfilter/Kconfig | 4 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_core.c | 3 | ||||
-rw-r--r-- | net/netfilter/nfnetlink.c | 5 | ||||
-rw-r--r-- | net/packet/af_packet.c | 115 |
19 files changed, 299 insertions, 219 deletions
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 1828a6b..47bd4a3 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -68,8 +68,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.43" -#define DRV_MODULE_RELDATE "Oct 24, 2005" +#define DRV_MODULE_VERSION "3.44" +#define DRV_MODULE_RELDATE "Dec 6, 2005" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 @@ -3565,12 +3565,15 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) if (!spin_trylock(&tp->tx_lock)) return NETDEV_TX_LOCKED; - /* This is a hard error, log it. */ if (unlikely(TX_BUFFS_AVAIL(tp) <= (skb_shinfo(skb)->nr_frags + 1))) { - netif_stop_queue(dev); + if (!netif_queue_stopped(dev)) { + netif_stop_queue(dev); + + /* This is a hard error, log it. */ + printk(KERN_ERR PFX "%s: BUG! Tx Ring full when " + "queue awake!\n", dev->name); + } spin_unlock(&tp->tx_lock); - printk(KERN_ERR PFX "%s: BUG! Tx Ring full when queue awake!\n", - dev->name); return NETDEV_TX_BUSY; } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6bc03c9..4be34ef 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -670,6 +670,9 @@ enum { NET_DECNET_DST_GC_INTERVAL = 9, NET_DECNET_CONF = 10, NET_DECNET_NO_FC_MAX_CWND = 11, + NET_DECNET_MEM = 12, + NET_DECNET_RMEM = 13, + NET_DECNET_WMEM = 14, NET_DECNET_DEBUG_LEVEL = 255 }; diff --git a/include/net/dn.h b/include/net/dn.h index c1dbbd2..a4b6168 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -234,4 +234,8 @@ extern int decnet_di_count; extern int decnet_dr_count; extern int decnet_no_fc_max_cwnd; +extern int sysctl_decnet_mem[3]; +extern int sysctl_decnet_wmem[3]; +extern int sysctl_decnet_rmem[3]; + #endif /* _NET_DN_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 0b48e29..a5efc9a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1113,7 +1113,8 @@ out: void netdev_rx_csum_fault(struct net_device *dev) { if (net_ratelimit()) { - printk(KERN_ERR "%s: hw csum failure.\n", dev->name); + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? dev->name : "<unknown>"); dump_stack(); } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b7d13a4..83fee37 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1725,7 +1725,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, * of the skb if any page alloc fails user this procedure returns -ENOMEM */ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int getfrag(void *from, char *to, int offset, + int (*getfrag)(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index f89e55f..d402e90 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -153,6 +153,7 @@ static struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; +static atomic_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); @@ -446,10 +447,26 @@ static void dn_destruct(struct sock *sk) dst_release(xchg(&sk->sk_dst_cache, NULL)); } +static int dn_memory_pressure; + +static void dn_enter_memory_pressure(void) +{ + if (!dn_memory_pressure) { + dn_memory_pressure = 1; + } +} + static struct proto dn_proto = { - .name = "DECNET", - .owner = THIS_MODULE, - .obj_size = sizeof(struct dn_sock), + .name = "NSP", + .owner = THIS_MODULE, + .enter_memory_pressure = dn_enter_memory_pressure, + .memory_pressure = &dn_memory_pressure, + .memory_allocated = &decnet_memory_allocated, + .sysctl_mem = sysctl_decnet_mem, + .sysctl_wmem = sysctl_decnet_wmem, + .sysctl_rmem = sysctl_decnet_rmem, + .max_header = DN_MAX_NSP_DATA_HEADER + 64, + .obj_size = sizeof(struct dn_sock), }; static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) @@ -470,6 +487,8 @@ static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; + sk->sk_sndbuf = sysctl_decnet_wmem[1]; + sk->sk_rcvbuf = sysctl_decnet_rmem[1]; /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 02bca49..0e9d2c5 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -10,6 +10,7 @@ * * Changes: * Steve Whitehouse - C99 changes and default device handling + * Steve Whitehouse - Memory buffer settings, like the tcp ones * */ #include <linux/config.h> @@ -37,6 +38,11 @@ int decnet_dr_count = 3; int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; +/* Reasonable defaults, I hope, based on tcp's defaults */ +int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + #ifdef CONFIG_SYSCTL extern int decnet_dst_gc_interval; static int min_decnet_time_wait[] = { 5 }; @@ -428,6 +434,33 @@ static ctl_table dn_table[] = { .extra1 = &min_decnet_no_fc_max_cwnd, .extra2 = &max_decnet_no_fc_max_cwnd }, + { + .ctl_name = NET_DECNET_MEM, + .procname = "decnet_mem", + .data = &sysctl_decnet_mem, + .maxlen = sizeof(sysctl_decnet_mem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_RMEM, + .procname = "decnet_rmem", + .data = &sysctl_decnet_rmem, + .maxlen = sizeof(sysctl_decnet_rmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_WMEM, + .procname = "decnet_wmem", + .data = &sysctl_decnet_wmem, + .maxlen = sizeof(sysctl_decnet_wmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, { .ctl_name = NET_DECNET_DEBUG_LEVEL, .procname = "debug", diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0bc0052..88a6065 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK instead of the individual packets. config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on IP_NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS IF unsure, say `N'. config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface' - depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m help This option enables support for a netlink-based userspace interface diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 7a4ecdd..84c66db 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } +void ip_conntrack_flush(void) +{ + ip_ct_iterate_cleanup(kill_all, NULL); +} + static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { if (vmalloced) @@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush(void) +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) { + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ @@ -1363,7 +1372,7 @@ void ip_conntrack_flush(void) ip_ct_event_cache_flush(); i_see_dead_people: - ip_ct_iterate_cleanup(kill_all, NULL); + ip_conntrack_flush(); if (atomic_read(&ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; @@ -1371,14 +1380,7 @@ void ip_conntrack_flush(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); -} -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) -{ - ip_ct_attach = NULL; - ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 3fce91b..91fe8f2 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -503,7 +503,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) } static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), @@ -528,7 +528,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, if (!tb[CTA_PROTO_NUM-1]) return -EINVAL; - tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); @@ -728,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; } } - if (del_timer(&ct->timeout)) { - ip_conntrack_put(ct); + if (del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct); - return 0; - } + ip_conntrack_put(ct); DEBUGP("leaving\n"); @@ -877,7 +875,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) DEBUGP("NAT status: %lu\n", status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - if (ip_nat_initialized(ct, hooknum)) + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) return -EEXIST; ip_nat_setup_info(ct, &range, hooknum); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index aeb7353..e7fa29e 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s, static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, const struct ip_conntrack *ct) { - struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + struct nfattr *nest_parms; read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), &ct->proto.tcp.state); read_unlock_bh(&tcp_lock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 029c70d..b7325e0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { - if (skb != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - int tcp_header_size = tp->tcp_header_len; - struct tcphdr *th; - int sysctl_flags; - int err; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + int tcp_header_size; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); + + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->rtt_sample) + __net_timestamp(skb); + + if (likely(clone_it)) { + if (unlikely(skb_cloned(skb))) + skb = pskb_copy(skb, gfp_mask); + else + skb = skb_clone(skb, gfp_mask); + if (unlikely(!skb)) + return -ENOBUFS; + } - BUG_ON(!tcp_skb_pcount(skb)); + inet = inet_sk(sk); + tp = tcp_sk(sk); + tcb = TCP_SKB_CB(skb); + tcp_header_size = tp->tcp_header_len; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 - /* If congestion control is doing timestamping */ - if (icsk->icsk_ca_ops->rtt_sample) - __net_timestamp(skb); - - sysctl_flags = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if(sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if(sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (tp->rx_opt.eff_sacks) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + sysctl_flags = 0; + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } - - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; - skb_set_owner_w(skb, sk); - - /* Build TCP header and checksum it. */ - th->source = inet->sport; - th->dest = inet->dport; - th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); - *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (tcb->flags & TCPCB_FLAG_SYN) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - th->window = htons(tp->rcv_wnd); - } else { - th->window = htons(tcp_select_window(sk)); + if (sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; } - th->check = 0; - th->urg_ptr = 0; - - if (tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); - th->urg = 1; + if (sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } + } else if (unlikely(tp->rx_opt.eff_sacks)) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK)); + } + + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->flags); + + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_syn_build_options((__u32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent); - } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, tcb->when); + if (unlikely(tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } - TCP_ECN_send(sk, tp, skb, tcp_header_size); - } - tp->af_specific->send_check(sk, th, skb->len, skb); + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } - if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tp->af_specific->send_check(sk, th, skb->len, skb); - if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + if (likely(tcb->flags & TCPCB_FLAG_ACK)) + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); - err = tp->af_specific->queue_xmit(skb, 0); - if (err <= 0) - return err; + TCP_INC_STATS(TCP_MIB_OUTSEGS); - tcp_enter_cwr(sk); + err = tp->af_specific->queue_xmit(skb, 0); + if (unlikely(err <= 0)) + return err; + + tcp_enter_cwr(sk); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; - /* NET_XMIT_CN is special. It does not guarantee, - * that this packet is lost. It tells that device - * is about to start to drop packets or already - * drops some packets of the same priority and - * invokes us to send less aggressively. - */ - return err == NET_XMIT_CN ? 0 : err; - } - return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. @@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); return; @@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, (skb_cloned(skb) ? - pskb_copy(skb, GFP_ATOMIC): - skb_clone(skb, GFP_ATOMIC))); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (err == 0) { /* Update global TCP statistics. */ @@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (tcp_transmit_skb(sk, skb)) + if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } @@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /* @@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk) __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ @@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff); + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } } @@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) @@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { update_send_head(sk, tp, skb); } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index b7d296a..13e7e6e 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->beg_snd_nxt = tp->snd_nxt; vegas->beg_snd_cwnd = tp->snd_cwnd; - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - tcp_vegas_rtt_calc(sk, seq_rtt * 1000); - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. @@ -333,11 +325,11 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 40d9a19..8bfbe99 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -248,7 +248,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) if (esp->conf.padlen) mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_full_len; + return mtu + x->props.header_len + esp->auth.icv_trunc_len; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index c0f1da5..a7e03cf 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -68,8 +68,8 @@ static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 }; - __u8 type = orig->dst.u.icmp.type - 128; - if (type >= sizeof(invmap) || !invmap[type]) + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return 0; tuple->src.u.icmp.id = orig->src.u.icmp.id; @@ -129,12 +129,12 @@ static int icmpv6_new(struct nf_conn *conntrack, [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; + int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) - || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ - DEBUGP("icmp: can't create new conn with type %u\n", - conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DEBUGP("icmpv6: can't create new conn with type %u\n", + type + 128); NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a84f922..794c41d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -61,8 +61,8 @@ config NF_CONNTRACK_MARK instead of the individual packets. config NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1da6783..a7c7b49 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1383,6 +1383,9 @@ void nf_conntrack_cleanup(void) schedule(); goto i_see_dead_people; } + /* wait until all references to nf_conntrack_untracked are dropped */ + while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + schedule(); for (i = 0; i < NF_CT_F_NUM; i++) { if (nf_ct_cache[i].use == 0) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a60c59b..95fdf04 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -162,7 +162,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, return -EINVAL; } - min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); if (unlikely(nlh->nlmsg_len < min_len)) return -EINVAL; @@ -236,8 +236,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, } /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) { DEBUGP("received message was too short\n"); return 0; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 499ae3d..3e24627 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1587,23 +1587,47 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); } -static void free_pg_vec(char **pg_vec, unsigned order, unsigned len) +static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) { int i; - for (i=0; i<len; i++) { - if (pg_vec[i]) { - struct page *page, *pend; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - ClearPageReserved(page); - free_pages((unsigned long)pg_vec[i], order); - } + for (i = 0; i < len; i++) { + if (likely(pg_vec[i])) + free_pages((unsigned long) pg_vec[i], order); } kfree(pg_vec); } +static inline char *alloc_one_pg_vec_page(unsigned long order) +{ + return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO, + order); +} + +static char **alloc_pg_vec(struct tpacket_req *req, int order) +{ + unsigned int block_nr = req->tp_block_nr; + char **pg_vec; + int i; + + pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); + if (unlikely(!pg_vec)) + goto out; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = alloc_one_pg_vec_page(order); + if (unlikely(!pg_vec[i])) + goto out_free_pgvec; + } + +out: + return pg_vec; + +out_free_pgvec: + free_pg_vec(pg_vec, order, block_nr); + pg_vec = NULL; + goto out; +} static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) { @@ -1617,64 +1641,46 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing /* Sanity tests and some calculations */ - if (po->pg_vec) + if (unlikely(po->pg_vec)) return -EBUSY; - if ((int)req->tp_block_size <= 0) + if (unlikely((int)req->tp_block_size <= 0)) return -EINVAL; - if (req->tp_block_size&(PAGE_SIZE-1)) + if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) return -EINVAL; - if (req->tp_frame_size < TPACKET_HDRLEN) + if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) return -EINVAL; - if (req->tp_frame_size&(TPACKET_ALIGNMENT-1)) + if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) return -EINVAL; po->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (po->frames_per_block <= 0) + if (unlikely(po->frames_per_block <= 0)) return -EINVAL; - if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr) + if (unlikely((po->frames_per_block * req->tp_block_nr) != + req->tp_frame_nr)) return -EINVAL; - /* OK! */ - - /* Allocate page vector */ - while ((PAGE_SIZE<<order) < req->tp_block_size) - order++; err = -ENOMEM; - - pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL); - if (pg_vec == NULL) + order = get_order(req->tp_block_size); + pg_vec = alloc_pg_vec(req, order); + if (unlikely(!pg_vec)) goto out; - memset(pg_vec, 0, req->tp_block_nr*sizeof(char **)); - - for (i=0; i<req->tp_block_nr; i++) { - struct page *page, *pend; - pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order); - if (!pg_vec[i]) - goto out_free_pgvec; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - SetPageReserved(page); - } - /* Page vector is allocated */ l = 0; - for (i=0; i<req->tp_block_nr; i++) { + for (i = 0; i < req->tp_block_nr; i++) { char *ptr = pg_vec[i]; struct tpacket_hdr *header; int k; - for (k=0; k<po->frames_per_block; k++) { - - header = (struct tpacket_hdr*)ptr; + for (k = 0; k < po->frames_per_block; k++) { + header = (struct tpacket_hdr *) ptr; header->tp_status = TP_STATUS_KERNEL; ptr += req->tp_frame_size; } } /* Done */ } else { - if (req->tp_frame_nr) + if (unlikely(req->tp_frame_nr)) return -EINVAL; } @@ -1701,7 +1707,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing spin_lock_bh(&sk->sk_receive_queue.lock); pg_vec = XC(po->pg_vec, pg_vec); - po->frame_max = req->tp_frame_nr-1; + po->frame_max = (req->tp_frame_nr - 1); po->head = 0; po->frame_size = req->tp_frame_size; spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1728,7 +1734,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing release_sock(sk); -out_free_pgvec: if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: @@ -1755,17 +1760,19 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) goto out; - atomic_inc(&po->mapped); start = vma->vm_start; - err = -EAGAIN; - for (i=0; i<po->pg_vec_len; i++) { - if (remap_pfn_range(vma, start, - __pa(po->pg_vec[i]) >> PAGE_SHIFT, - po->pg_vec_pages*PAGE_SIZE, - vma->vm_page_prot)) - goto out; - start += po->pg_vec_pages*PAGE_SIZE; + for (i = 0; i < po->pg_vec_len; i++) { + struct page *page = virt_to_page(po->pg_vec[i]); + int pg_num; + + for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { + err = vm_insert_page(vma, start, page); + if (unlikely(err)) + goto out; + start += PAGE_SIZE; + } } + atomic_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; |