diff options
Diffstat (limited to 'net/ipv4/ip_fragment.c')
-rw-r--r-- | net/ipv4/ip_fragment.c | 691 |
1 files changed, 691 insertions, 0 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c new file mode 100644 index 0000000..7f68e27 --- /dev/null +++ b/net/ipv4/ip_fragment.c @@ -0,0 +1,691 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP fragmentation functionality. + * + * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ + * + * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : Split from ip.c , see ip_input.c for history. + * David S. Miller : Begin massive cleanup... + * Andi Kleen : Add sysctls. + * xxxx : Overlapfrag bug. + * Ultima : ip_expire() kernel panic. + * Bill Hawes : Frag accounting and evictor fixes. + * John McDonald : 0 length frag bug. + * Alexey Kuznetsov: SMP races, threading, cleanup. + * Patrick McHardy : LRU queue of frag heads for evictor. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/jiffies.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/checksum.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/inet.h> +#include <linux/netfilter_ipv4.h> + +/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c + * as well. Or notify me, at least. --ANK + */ + +/* Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to measurably + * harm machine performance. + */ +int sysctl_ipfrag_high_thresh = 256*1024; +int sysctl_ipfrag_low_thresh = 192*1024; + +/* Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. + */ +int sysctl_ipfrag_time = IP_FRAG_TIME; + +struct ipfrag_skb_cb +{ + struct inet_skb_parm h; + int offset; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) + +/* Describe an entry in the "incomplete datagrams" queue. */ +struct ipq { + struct ipq *next; /* linked list pointers */ + struct list_head lru_list; /* lru list member */ + u32 user; + u32 saddr; + u32 daddr; + u16 id; + u8 protocol; + u8 last_in; +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + + struct sk_buff *fragments; /* linked list of received fragments */ + int len; /* total length of original datagram */ + int meat; + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* when will this queue expire? */ + struct ipq **pprev; + int iif; + struct timeval stamp; +}; + +/* Hash table. */ + +#define IPQ_HASHSZ 64 + +/* Per-bucket lock is easy to add now. */ +static struct ipq *ipq_hash[IPQ_HASHSZ]; +static DEFINE_RWLOCK(ipfrag_lock); +static u32 ipfrag_hash_rnd; +static LIST_HEAD(ipq_lru_list); +int ip_frag_nqueues = 0; + +static __inline__ void __ipq_unlink(struct ipq *qp) +{ + if(qp->next) + qp->next->pprev = qp->pprev; + *qp->pprev = qp->next; + list_del(&qp->lru_list); + ip_frag_nqueues--; +} + +static __inline__ void ipq_unlink(struct ipq *ipq) +{ + write_lock(&ipfrag_lock); + __ipq_unlink(ipq); + write_unlock(&ipfrag_lock); +} + +static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot) +{ + return jhash_3words((u32)id << 16 | prot, saddr, daddr, + ipfrag_hash_rnd) & (IPQ_HASHSZ - 1); +} + +static struct timer_list ipfrag_secret_timer; +int sysctl_ipfrag_secret_interval = 10 * 60 * HZ; + +static void ipfrag_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&ipfrag_lock); + get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); + for (i = 0; i < IPQ_HASHSZ; i++) { + struct ipq *q; + + q = ipq_hash[i]; + while (q) { + struct ipq *next = q->next; + unsigned int hval = ipqhashfn(q->id, q->saddr, + q->daddr, q->protocol); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = ipq_hash[hval]) != NULL) + q->next->pprev = &q->next; + ipq_hash[hval] = q; + q->pprev = &ipq_hash[hval]; + } + + q = next; + } + } + write_unlock(&ipfrag_lock); + + mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval); +} + +atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ + +/* Memory Tracking Functions. */ +static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &ip_frag_mem); + kfree_skb(skb); +} + +static __inline__ void frag_free_queue(struct ipq *qp, int *work) +{ + if (work) + *work -= sizeof(struct ipq); + atomic_sub(sizeof(struct ipq), &ip_frag_mem); + kfree(qp); +} + +static __inline__ struct ipq *frag_alloc_queue(void) +{ + struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); + + if(!qp) + return NULL; + atomic_add(sizeof(struct ipq), &ip_frag_mem); + return qp; +} + + +/* Destruction primitives. */ + +/* Complete destruction of ipq. */ +static void ip_frag_destroy(struct ipq *qp, int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(qp->last_in&COMPLETE); + BUG_TRAP(del_timer(&qp->timer) == 0); + + /* Release all fragment data. */ + fp = qp->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + /* Finally, release the queue descriptor itself. */ + frag_free_queue(qp, work); +} + +static __inline__ void ipq_put(struct ipq *ipq, int *work) +{ + if (atomic_dec_and_test(&ipq->refcnt)) + ip_frag_destroy(ipq, work); +} + +/* Kill ipq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static void ipq_kill(struct ipq *ipq) +{ + if (del_timer(&ipq->timer)) + atomic_dec(&ipq->refcnt); + + if (!(ipq->last_in & COMPLETE)) { + ipq_unlink(ipq); + atomic_dec(&ipq->refcnt); + ipq->last_in |= COMPLETE; + } +} + +/* Memory limiting on fragments. Evictor trashes the oldest + * fragment queue until we are back under the threshold. + */ +static void ip_evictor(void) +{ + struct ipq *qp; + struct list_head *tmp; + int work; + + work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh; + if (work <= 0) + return; + + while (work > 0) { + read_lock(&ipfrag_lock); + if (list_empty(&ipq_lru_list)) { + read_unlock(&ipfrag_lock); + return; + } + tmp = ipq_lru_list.next; + qp = list_entry(tmp, struct ipq, lru_list); + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + + spin_lock(&qp->lock); + if (!(qp->last_in&COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->lock); + + ipq_put(qp, &work); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } +} + +/* + * Oops, a fragment queue timed out. Kill it and send an ICMP reply. + */ +static void ip_expire(unsigned long arg) +{ + struct ipq *qp = (struct ipq *) arg; + + spin_lock(&qp->lock); + + if (qp->last_in & COMPLETE) + goto out; + + ipq_kill(qp); + + IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + + if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) { + struct sk_buff *head = qp->fragments; + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + if ((head->dev = dev_get_by_index(qp->iif)) != NULL) { + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + dev_put(head->dev); + } + } +out: + spin_unlock(&qp->lock); + ipq_put(qp, NULL); +} + +/* Creation primitives. */ + +static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) +{ + struct ipq *qp; + + write_lock(&ipfrag_lock); +#ifdef CONFIG_SMP + /* With SMP race we have to recheck hash table, because + * such entry could be created on other cpu, while we + * promoted read lock to write lock. + */ + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == qp_in->id && + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && + qp->user == qp_in->user) { + atomic_inc(&qp->refcnt); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; + ipq_put(qp_in, NULL); + return qp; + } + } +#endif + qp = qp_in; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) + atomic_inc(&qp->refcnt); + + atomic_inc(&qp->refcnt); + if((qp->next = ipq_hash[hash]) != NULL) + qp->next->pprev = &qp->next; + ipq_hash[hash] = qp; + qp->pprev = &ipq_hash[hash]; + INIT_LIST_HEAD(&qp->lru_list); + list_add_tail(&qp->lru_list, &ipq_lru_list); + ip_frag_nqueues++; + write_unlock(&ipfrag_lock); + return qp; +} + +/* Add an entry to the 'ipq' queue for a newly received IP datagram. */ +static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) +{ + struct ipq *qp; + + if ((qp = frag_alloc_queue()) == NULL) + goto out_nomem; + + qp->protocol = iph->protocol; + qp->last_in = 0; + qp->id = iph->id; + qp->saddr = iph->saddr; + qp->daddr = iph->daddr; + qp->user = user; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + /* Initialize a timer for this entry. */ + init_timer(&qp->timer); + qp->timer.data = (unsigned long) qp; /* pointer to queue */ + qp->timer.function = ip_expire; /* expire function */ + spin_lock_init(&qp->lock); + atomic_set(&qp->refcnt, 1); + + return ip_frag_intern(hash, qp); + +out_nomem: + NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); + return NULL; +} + +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. + */ +static inline struct ipq *ip_find(struct iphdr *iph, u32 user) +{ + __u16 id = iph->id; + __u32 saddr = iph->saddr; + __u32 daddr = iph->daddr; + __u8 protocol = iph->protocol; + unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + struct ipq *qp; + + read_lock(&ipfrag_lock); + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && + qp->user == user) { + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + return qp; + } + } + read_unlock(&ipfrag_lock); + + return ip_frag_create(hash, iph, user); +} + +/* Add new segment to existing queue. */ +static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +{ + struct sk_buff *prev, *next; + int flags, offset; + int ihl, end; + + if (qp->last_in & COMPLETE) + goto err; + + offset = ntohs(skb->nh.iph->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = skb->nh.iph->ihl * 4; + + /* Determine the position of this fragment. */ + end = offset + skb->len - ihl; + + /* Is this the final fragment? */ + if ((flags & IP_MF) == 0) { + /* If we already have some bits beyond end + * or have different end, the segment is corrrupted. + */ + if (end < qp->len || + ((qp->last_in & LAST_IN) && end != qp->len)) + goto err; + qp->last_in |= LAST_IN; + qp->len = end; + } else { + if (end&7) { + end &= ~7; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + if (end > qp->len) { + /* Some bits beyond end -> corruption. */ + if (qp->last_in & LAST_IN) + goto err; + qp->len = end; + } + } + if (end == offset) + goto err; + + if (pskb_pull(skb, ihl) == NULL) + goto err; + if (pskb_trim(skb, end-offset)) + goto err; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = qp->fragments; next != NULL; next = next->next) { + if (FRAG_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + if (!pskb_pull(skb, i)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + while (next && FRAG_CB(next)->offset < end) { + int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + if (!pskb_pull(next, i)) + goto err; + FRAG_CB(next)->offset += i; + qp->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + qp->fragments = next; + + qp->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + FRAG_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + qp->fragments = skb; + + if (skb->dev) + qp->iif = skb->dev->ifindex; + skb->dev = NULL; + qp->stamp = skb->stamp; + qp->meat += skb->len; + atomic_add(skb->truesize, &ip_frag_mem); + if (offset == 0) + qp->last_in |= FIRST_IN; + + write_lock(&ipfrag_lock); + list_move_tail(&qp->lru_list, &ipq_lru_list); + write_unlock(&ipfrag_lock); + + return; + +err: + kfree_skb(skb); +} + + +/* Build a new IP datagram from all its fragments. */ + +static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) +{ + struct iphdr *iph; + struct sk_buff *fp, *head = qp->fragments; + int len; + int ihlen; + + ipq_kill(qp); + + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG_CB(head)->offset == 0); + + /* Allocate a new buffer for the datagram. */ + ihlen = head->nh.iph->ihl*4; + len = ihlen + qp->len; + + if(len > 65535) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_nomem; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_nomem; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; i<skb_shinfo(head)->nr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &ip_frag_mem); + } + + skb_shinfo(head)->frag_list = head->next; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &ip_frag_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &ip_frag_mem); + } + + head->next = NULL; + head->dev = dev; + head->stamp = qp->stamp; + + iph = head->nh.iph; + iph->frag_off = 0; + iph->tot_len = htons(len); + IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + qp->fragments = NULL; + return head; + +out_nomem: + NETDEBUG(if (net_ratelimit()) + printk(KERN_ERR + "IP: queue_glue: no memory for gluing queue %p\n", + qp)); + goto out_fail; +out_oversize: + if (net_ratelimit()) + printk(KERN_INFO + "Oversized IP packet from %d.%d.%d.%d.\n", + NIPQUAD(qp->saddr)); +out_fail: + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return NULL; +} + +/* Process an incoming IP datagram fragment. */ +struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) +{ + struct iphdr *iph = skb->nh.iph; + struct ipq *qp; + struct net_device *dev; + + IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + + /* Start by cleaning up the memory. */ + if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) + ip_evictor(); + + dev = skb->dev; + + /* Lookup (or create) queue header */ + if ((qp = ip_find(iph, user)) != NULL) { + struct sk_buff *ret = NULL; + + spin_lock(&qp->lock); + + ip_frag_queue(qp, skb); + + if (qp->last_in == (FIRST_IN|LAST_IN) && + qp->meat == qp->len) + ret = ip_frag_reasm(qp, dev); + + spin_unlock(&qp->lock); + ipq_put(qp, NULL); + return ret; + } + + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return NULL; +} + +void ipfrag_init(void) +{ + ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&ipfrag_secret_timer); + ipfrag_secret_timer.function = ipfrag_secret_rebuild; + ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval; + add_timer(&ipfrag_secret_timer); +} + +EXPORT_SYMBOL(ip_defrag); |